This project is about to mine new small business client leads to a Bank based on the features of already acquired small business clients.
notebook setup
knitr::opts_chunk$set(tidy=TRUE,options(warn=1),cache=TRUE)
chooseCRANmirror(graphics=FALSE,ind=1)
knitr::opts_knit$set(root.dir=getwd())
cleaning the workplace
rm(list = ls())
gc(reset = TRUE)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 582187 31.1 940480 50.3 582187 31.1
Vcells 1417360 10.9 2377228 18.2 1417360 10.9
unlink(paste(getwd(), '/Client_Acquisition_for_a_Bank.nb.html', sep = ''), recursive = TRUE)
unlink(paste(getwd(), '/_results/BankiUgyfelModel', sep = ''), recursive = TRUE)
unlink(paste(getwd(), '/_results/BankiUgyfelModel.java', sep = ''), recursive = TRUE)
unlink(paste(getwd(), '/_results/h2o-genmodel.jar', sep = ''), recursive = TRUE)
setting general options
Sys.getlocale()
[1] "LC_CTYPE=en_US.UTF-8;LC_NUMERIC=C;LC_TIME=en_US.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=en_US.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
Sys.setlocale('LC_ALL','C')
[1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
options(StringAsFactor = FALSE)
print(paste('You are in', getwd(), 'directory. The intermediate materials and results will be placed here.', sep = ' '))
[1] "You are in /home/sbudai/Documents/projects/GitHub/Client_Acquisition_for_a_Bank directory. The intermediate materials and results will be placed here."
creating necessary subfolders
dir.create(file.path(getwd(), '_tmp'), showWarnings = FALSE)
dir.create(file.path(getwd(), '_results'), showWarnings = FALSE)
calling ‘install.load’ library and installing if required It contains a cool function for package installing/loading.
if(!'install.load' %in% rownames(installed.packages())) {
install.packages('install.load')
Sys.sleep(6)
}
library(install.load)
calling all the other necessary libraries and installing if required
install_load('formatR',
'data.table',
'sqldf',
'stringi',
'validate',
'ggplot2',
'ggthemes',
'simputation',
'knitr',
'xml2',
'reshape2',
'XML',
'pander',
'Imap',
'igraph',
'ggmap',
'randomForest',
'missForest',
'h2o',
'grid',
'plotly',
'htmlwidgets',
'openxlsx',
'RSelenium')
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Attaching package: 'reshape2'
The following objects are masked from 'package:data.table':
dcast, melt
Attaching package: 'igraph'
The following object is masked from 'package:validate':
compare
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:base':
union
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:simputation':
na.roughfix
The following object is masked from 'package:ggplot2':
margin
Loading required package: foreach
foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com
Loading required package: itertools
Loading required package: iterators
----------------------------------------------------------------------
Your next step is to start H2O:
> h2o.init()
For H2O package documentation, ask for help:
> ??h2o
After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai
----------------------------------------------------------------------
Attaching package: 'h2o'
The following objects are masked from 'package:data.table':
hour, month, week, year
The following objects are masked from 'package:stats':
cor, sd, var
The following objects are masked from 'package:base':
%*%, %in%, &&, apply, as.factor, as.numeric, colnames, colnames<-, ifelse, is.character, is.factor,
is.numeric, log, log10, log1p, log2, round, signif, trunc, ||
Attaching package: 'plotly'
The following object is masked from 'package:ggmap':
wind
The following objects are masked from 'package:igraph':
%>%, groups
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
Sys.sleep(3)
keys <- data.table(fread('~/Documents/creds.csv'))
keys <- keys[ProjectTitle == 'Client_Acquisition_for_a_Bank', (2:3), with = FALSE]
creating function to display graphs on one page
# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
## Multiple plot function
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols: Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
####
# multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
# # Make a list from the ... arguments and plotlist
# plots <- c(list(...), plotlist)
# numPlots = length(plots)
# # If layout is NULL, then use 'cols' to determine layout
# if (is.null(layout)) {
# # Make the panel
# # ncol: Number of columns of plots
# # nrow: Number of rows needed, calculated from # of cols
# layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
# ncol = cols,
# nrow = ceiling(numPlots/cols))
# }
# if (numPlots == 1) {
# print(plots[[1]])
# } else {
# # Set up the page
# grid.newpage()
# pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# # Make each plot, in the correct location
# for (i in 1:numPlots) {
# # Get the i,j matrix positions of the regions that contain this subplot
# matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
# print(plots[[i]], vp = viewport(layout.pos.row = matchidx[, row],
# layout.pos.col = matchidx[, col]))
# }
# }
#}
importing all companies’ list
file_in = paste('https://www.dropbox.com/s/', keys[AppName == 'felveteli_feladat_v2', 2], '?raw=1', sep = '')
company_list <- fread(file_in,
sep = ';',
na.strings = c('na', 'n/a', 'NA', 'N/A', ''),
strip.white = TRUE,
integer64 = 'numeric',
colClasses = list(character = 1, 5, 6, 7, 12, 14))
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 97487 0 97487 0 0 62949 0 --:--:-- 0:00:01 --:--:-- 62949
100 2222k 0 2222k 0 0 876k 0 --:--:-- 0:00:02 --:--:-- 2151k
100 4334k 0 4334k 0 0 1225k 0 --:--:-- 0:00:03 --:--:-- 2131k
100 5902k 0 5902k 0 0 1301k 0 --:--:-- 0:00:04 --:--:-- 1943k
100 7662k 0 7662k 0 0 1383k 0 --:--:-- 0:00:05 --:--:-- 1897k
100 9454k 0 9454k 0 0 1446k 0 --:--:-- 0:00:06 --:--:-- 1876k
100 10.8M 0 10.8M 0 0 1478k 0 --:--:-- 0:00:07 --:--:-- 1784k
100 12.4M 0 12.4M 0 0 1491k 0 --:--:-- 0:00:08 --:--:-- 1679k
100 14.1M 0 14.1M 0 0 1518k 0 --:--:-- 0:00:09 --:--:-- 1714k
100 15.9M 0 15.9M 0 0 1548k 0 --:--:-- 0:00:10 --:--:-- 1731k
100 16.2M 0 16.2M 0 0 1552k 0 --:--:-- 0:00:10 --:--:-- 1719k
Read 84.9% of 435704 rows
Read 435704 rows and 14 (of 14) columns from 0.051 GB file in 00:00:03
let’s bring column names into a general name convention
colnames(company_list) <- gsub(' ', '', stri_trans_totitle(gsub('_', ' ', stri_trans_general(colnames(company_list), 'Latin-ASCII'))))
company_list[, ':=' (FotevekenysegText = stri_trans_general(FotevekenysegText , 'Latin-ASCII'),
Szekhely = stri_trans_general(Szekhely, 'Latin-ASCII'),
SzekhelyVaros = stri_trans_general(SzekhelyVaros, 'Latin-ASCII'))]
saving
file_out = paste(getwd(), '/_tmp/company_list.RData', sep = '')
fwrite(company_list,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(company_list[sample(.N, 5)], split.table = 200)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos AktivTulajdonosSzam Cegforma
--------- ----------- ---------------- ----------- ------------------------ --------------- ----------------------------- --------- --------- ----------------------- --------------------- ----------
11555535 81903000 10204000 84650000 4024 DEBRECEN VAR 10 DEBRECEN Kereskedelem, gepjarmujavitas NA 19 2 2 KFT
10690011 -3099000 -2012000 10085000 1147 BUDAPEST FURESZ 111 BUDAPEST Adminisztrativ es 2 25 2 2 KFT
szolgaltatast tamogato
tevekenyseg
24859705 0 0 0 1132 BUDAPEST VACI 64 BUDAPEST Szallashely-szolgaltatas, NA 3 1 1 KFT
vendeglatas
20024389 1517000 0 0 7631 PECS MEGYERI 64 PECS Szakmai, tudomanyos, muszaki NA 25 2 2 NA
tevekenyseg
23919662 4036000 340000 11401000 6300 KALOCSA ERDEI 54 KALOCSA Ingatlanugyletek NA 4 2 2 KFT
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
-----------------------------
BankiUgyfel AdoszamHosszu
------------- ---------------
NA 11555535-2-09
NA 10690011-2-42
NA 24859705-2-41
NA 20024389-2-02
NA 23919662-2-03
-----------------------------
importing owner companies’ list
file_in = paste('https://www.dropbox.com/s/', keys[AppName == 'felveteli_feladat_tulajok_listaja', 2], '?raw=1', sep = '')
owner_company_list <- fread(file_in,
sep = ';',
na.strings = c('na', 'n/a', 'NA', 'N/A', ''),
strip.white = TRUE,
colClasses = list(character = 1, 2))
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
100 15550 0 15550 0 0 7642 0 --:--:-- 0:00:02 --:--:-- 7642
100 275k 0 275k 0 0 127k 0 --:--:-- 0:00:02 --:--:-- 1959k
let’s bring column names into a general name convention
colnames(owner_company_list) <- gsub(' ', '', stri_trans_totitle(gsub('_', ' ', stri_trans_general(colnames(owner_company_list), 'Latin-ASCII'))))
saving
file_out = paste(getwd(), '/_tmp/owner_company_list.RData', sep = '')
fwrite(owner_company_list,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(owner_company_list[sample(.N, 5)], split.table = 200)
------------------------
Adoszam AdoszamTulaj
--------- --------------
12287866 23277524
23826551 23814323
10752337 10458923
25365375 11750170
22979924 14083754
------------------------
importing postcode list of bank branches
file_in = paste('https://www.dropbox.com/s/', keys[AppName == 'irszamlista', 2], '?raw=1', sep = '')
branch_zip_list <- fread(file_in,
sep = ';',
na.strings = c('na', 'n/a', 'NA', 'N/A', ''),
strip.white = TRUE,
colClasses = list(character = 1))
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 203 0 203 0 0 246 0 --:--:-- --:--:-- --:--:-- 246
100 203 0 203 0 0 246 0 --:--:-- --:--:-- --:--:-- 0
setnames(branch_zip_list, 'irányÃtószám', 'IrSzam')
saving
file_out = paste(getwd(), '/_tmp/branch_zip_list.RData', sep = '')
fwrite(branch_zip_list,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(branch_zip_list[sample(.N, 5)], split.table = 200)
--------
IrSzam
--------
6065
6334
6133
6237
6413
--------
initial summary of company list - let’s check a sample of the data
pander(summary(company_list), split.table = 200)
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos
---------------- ------------------ ------------------ ------------------ ---------------- ---------------- ------------------- --------------- -------------- -----------------------
Length:435704 Min. :-3.787e+10 Min. :-6.802e+10 Min. :-8.941e+08 Length:435704 Length:435704 Length:435704 Min. : 1.00 Min. : 0.00 Min. : 0.000
Class :character 1st Qu.: 0.000e+00 1st Qu.:-7.200e+04 1st Qu.: 9.000e+03 Class :character Class :character Class :character 1st Qu.: 1.00 1st Qu.: 5.00 1st Qu.: 1.000
Mode :character Median : 2.861e+06 Median : 2.600e+04 Median : 5.525e+06 Mode :character Mode :character Mode :character Median : 2.00 Median : 9.00 Median : 2.000
NA Mean : 9.321e+07 Mean : 1.042e+07 Mean : 1.681e+08 NA NA NA Mean : 14.08 Mean : 11.26 Mean : 2.014
NA 3rd Qu.: 1.051e+07 3rd Qu.: 1.120e+06 3rd Qu.: 2.921e+07 NA NA NA 3rd Qu.: 4.00 3rd Qu.: 17.00 3rd Qu.: 2.000
NA Max. : 3.498e+12 Max. : 2.866e+12 Max. : 4.396e+12 NA NA NA Max. :31646.00 Max. :116.00 Max. :908.000
NA NA NA NA NA NA NA NA's :313007 NA's :384 NA's :16318
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
---------------------------------------------------------------------
AktivTulajdonosSzam Cegforma BankiUgyfel AdoszamHosszu
--------------------- ---------------- ------------- ----------------
Min. : 1.000 Length:435704 Min. :1 Length:435704
1st Qu.: 1.000 Class :character 1st Qu.:1 Class :character
Median : 2.000 Mode :character Median :1 Mode :character
Mean : 2.125 NA Mean :1 NA
3rd Qu.: 2.000 NA 3rd Qu.:1 NA
Max. :914.000 NA Max. :1 NA
NA's :16318 NA NA's :430619 NA
---------------------------------------------------------------------
prerequisite of detection of Adoszam values being unique is to order them by that column
company_list <- company_list[order(Adoszam)]
let’s check column values validity
ct <- check_that(company_list,
valid_notation_BankiUgyfel = BankiUgyfel == 0
| BankiUgyfel == 1,
distinct_Adoszam = rle(sort(Adoszam))[[1]] == 1,
complete_Adoszam = stri_length(Adoszam) == 8,
complete_AdoszamHosszu = stri_length(AdoszamHosszu) == 13,
identical_Adoszam_and_AdoszamHosszu = Adoszam == substr(AdoszamHosszu,1 , 8),
initcap_Cegforma = Cegforma == stri_trans_totitle(Cegforma),
valid_AktivTulajdonosSzam = AktivTulajdonosSzam >= 1,
valid_AktivPrivatTulajdonos = AktivTulajdonosSzam >= AktivPrivatTulajdonos,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1,
not_negative_Cegkora = Cegkora >= 0,
under_UCL_Cegkora = Cegkora <= mean(Cegkora, na.rm = TRUE) + 3 * sd(Cegkora, na.rm = TRUE),
valid_Letszam = Letszam >= 1,
filled_FotevekenysegText = stri_length(FotevekenysegText) >= 1,
initcap_FotevekenysegText = FotevekenysegText == stri_trans_totitle(FotevekenysegText),
valid_postcode = nchar(sub(' .*', '', stri_trim(Szekhely))) == 4,
valid_SzekhelyVaros = stri_length(SzekhelyVaros) >= 2
& grepl('\\d', SzekhelyVaros) == FALSE,
initcap_SzekhelyVaros = SzekhelyVaros == stri_trans_totitle(SzekhelyVaros),
valid_Szekhely = stri_length(Szekhely) >= 11
& grepl('\\D', substr(stri_trim(Szekhely), 1, 4)) == FALSE
& grepl('\\^', Szekhely) == FALSE,
initcap_Szekhely = Szekhely == stri_trans_totitle(Szekhely),
filled_Arbevetel = stri_length(Arbevetel) >= 1,
filled_MerlegEredmeny = stri_length(MerlegEredmeny) >= 1,
filled_SajatToke = stri_length(SajatToke) >= 1,
filled_FotevekenysegText_and_AktivTulajdonosSzam_and_Cegforma = stri_length(FotevekenysegText) >= 1
& stri_length(AktivTulajdonosSzam) >= 1
& stri_length(Cegforma) >= 1
)
dt <- data.table(summary(ct))
dt <- dt[, c(1:5, 8)]
let’s see the validity of the data
pander(dt, split.table = 200)
-----------------------------------------------------------------------------------------------------------------------------------
rule items passes fails nNA expression
------------------------------------------------------------- ------- -------- ------- ------ -------------------------------------
valid_notation_BankiUgyfel 435704 5085 0 430619 abs(BankiUgyfel - 0) < 1e-08 |
abs(BankiUgyfel - 1) < 1e-08
distinct_Adoszam 435439 435229 210 0 rle(sort(Adoszam))[[1]] == 1
complete_Adoszam 435704 435704 0 0 stri_length(Adoszam) == 8
complete_AdoszamHosszu 435704 435219 24 461 stri_length(AdoszamHosszu) ==
13
identical_Adoszam_and_AdoszamHosszu 435704 435112 131 461 Adoszam ==
substr(AdoszamHosszu, 1, 8)
initcap_Cegforma 435704 0 400465 35239 Cegforma ==
stri_trans_totitle(Cegforma)
valid_AktivTulajdonosSzam 435704 419386 0 16318 AktivTulajdonosSzam >= 1
valid_AktivPrivatTulajdonos 435704 419386 0 16318 AktivTulajdonosSzam >=
AktivPrivatTulajdonos
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos 435704 419386 0 16318 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1
not_negative_Cegkora 435704 435320 0 384 Cegkora >= 0
under_UCL_Cegkora 435704 434401 919 384 Cegkora <= mean(Cegkora, na.rm
= TRUE) + 3 * sd(Cegkora,
na.rm = TRUE)
valid_Letszam 435704 122697 0 313007 Letszam >= 1
filled_FotevekenysegText 435704 370787 0 64917 stri_length(FotevekenysegText)
>= 1
initcap_FotevekenysegText 435704 99980 270807 64917 FotevekenysegText ==
stri_trans_totitle(FotevekenysegText)
valid_postcode 435704 435694 9 1 nchar(sub(" .*", "",
stri_trim(Szekhely))) == 4
valid_SzekhelyVaros 435704 435700 3 1 stri_length(SzekhelyVaros) >=
2 & grepl("\\d",
SzekhelyVaros) == FALSE
initcap_SzekhelyVaros 435704 7 435696 1 SzekhelyVaros ==
stri_trans_totitle(SzekhelyVaros)
valid_Szekhely 435704 435691 12 1 stri_length(Szekhely) >= 11 &
grepl("\\D",
substr(stri_trim(Szekhely), 1,
4)) == FALSE & grepl("\\^",
Szekhely) == FALSE
initcap_Szekhely 435704 5 435698 1 Szekhely ==
stri_trans_totitle(Szekhely)
filled_Arbevetel 435704 435704 0 0 stri_length(Arbevetel) >= 1
filled_MerlegEredmeny 435704 435704 0 0 stri_length(MerlegEredmeny) >=
1
filled_SajatToke 435704 435704 0 0 stri_length(SajatToke) >= 1
filled_FotevekenysegText_and_AktivTulajdonosSzam_and_Cegforma 435704 328402 0 107302 stri_length(FotevekenysegText)
>= 1 &
stri_length(AktivTulajdonosSzam)
>= 1 & stri_length(Cegforma)
>= 1
-----------------------------------------------------------------------------------------------------------------------------------
Adoszam is not distinct in the table, so I drop the repeating lines and double check it
company_list <- data.table(unique(company_list))
prerequisite of detection of Adoszam values being unique is to order them by that column
company_list <- company_list[order(Adoszam)]
ct <- check_that(company_list,
distinct_Adoszam = rle(sort(Adoszam))[[1]] == 1)
dt <- data.table(summary(ct))
dt <- dt[, c(1:5, 8)]
let’s see the validity of the data
pander(dt, split.table = 200)
----------------------------------------------------------------------------
rule items passes fails nNA expression
---------------- ------- -------- ------- ----- ----------------------------
distinct_Adoszam 435439 435439 0 0 rle(sort(Adoszam))[[1]] == 1
----------------------------------------------------------------------------
It seems ok. We got rid of duplicates.
let’s drop commas from Szekhely and SzekhelyVaros columns & check a sample of the data
pander(company_list[grepl(',', Szekhely) | grepl(',', SzekhelyVaros), .(Adoszam, Szekhely, SzekhelyVaros)])
------------------------------------------------------------
Adoszam Szekhely SzekhelyVaros
--------- ------------------------------ -------------------
10118720 0, Budapest, V., Aranykez u. dapest,
2.
10223783 0, Budapest, XII.ker. Diosarok dapest,
ut 62.
10386543 114, Budapest, Villanyi ut Budapest,
11-13.
11541082 0, Pecs, Szarvas dulo 15. �cs,
12291625 0, Sajoszentpeter, Jozsef A. joszentpeter,
u. 42.
20009946 0, Pecs, Fellbach �cs,
Bevasarlokozpont
22810773 0, Debrecen-Erdospuszta, brecen-Erdospuszta,
Vekeri-piheno
23660258 0, Tatabanya, II., Sarberki tabanya,
uzeletsor
29628203 0, Nagykoros, Barany u. 5/b. gykoros,
29814730 0, Budapest, VII.ker. Izabella dapest,
u. 36-38.
------------------------------------------------------------
company_list[grepl(',', Szekhely), Szekhely := gsub(',', '', Szekhely)]
company_list[grepl(',', SzekhelyVaros), SzekhelyVaros := gsub(',', '', SzekhelyVaros)]
let’s transform text columns to initial capitals
company_list[Cegforma != stri_trans_totitle(Cegforma), Cegforma := stri_trans_totitle(Cegforma)]
company_list[FotevekenysegText != stri_trans_totitle(FotevekenysegText), FotevekenysegText := stri_trans_totitle(FotevekenysegText)]
company_list[SzekhelyVaros != stri_trans_totitle(SzekhelyVaros), SzekhelyVaros := stri_trans_totitle(SzekhelyVaros)]
company_list[Szekhely != stri_trans_totitle(Szekhely), Szekhely := stri_trans_totitle(Szekhely)]
let’s transform the dependent variable (BankiUgyfel) into a real dummy variable
company_list[BankiUgyfel == 1, BankiUgyfel_ := TRUE]
company_list[is.na(BankiUgyfel), BankiUgyfel_ := FALSE]
company_list[, BankiUgyfel := NULL]
setnames(company_list, 'BankiUgyfel_', 'BankiUgyfel')
let’s check a sample of the data
pander(company_list[, .N, by = BankiUgyfel], split.table = 200)
--------------------
BankiUgyfel N
------------- ------
FALSE 430619
TRUE 4820
--------------------
there are some companies with not matching Adoszam and AdoszamHosszu / I drop those AdoszamHosszu data
n <- company_list[Adoszam != substr(AdoszamHosszu,1 , 8), .N]
pander(paste('The number of not matching AdoszamHosszu data: ', n, sep = ''))
The number of not matching AdoszamHosszu data: 131
let’s check a sample of the data
pander(company_list[Adoszam != substr(AdoszamHosszu,1 , 8), .(Adoszam, AdoszamHosszu, Szekhely)][sample(.N, 5)], split.table = 200)
---------------------------------------------------
Adoszam AdoszamHosszu Szekhely
--------- --------------- -------------------------
10959493 10785052-1-43 1111 Budapest Bartok 36
24465207 24460185-2-41 1011 Budapest Jegverem 8
10936272 10930272-1-43 1118 Budapest Menesi 36
12180075 12180675-2-41 1037 Budapest Fergeteg 13
28274193 28103972-2-42 1011 Budapest Maria 3
---------------------------------------------------
dropping not matching AdoszamHosszu values
company_list[Adoszam != substr(AdoszamHosszu,1 , 8), AdoszamHosszu := NA]
there are some companies with not complete AdoszamHosszu / I drop those AdoszamHosszu values
n <- company_list[stri_length(AdoszamHosszu) < 13, .N]
pander(paste('The number of not complete AdoszamHosszu data: ', n, sep = ''))
The number of not complete AdoszamHosszu data: 19
let’s check a sample of the data
pander(company_list[stri_length(AdoszamHosszu) < 13, .(Adoszam, AdoszamHosszu, Szekhely)][sample(.N, 5)], split.table = 200)
---------------------------------------------------
Adoszam AdoszamHosszu Szekhely
--------- --------------- -------------------------
10894576 10894576-0-1 2367 Ujhartyan Malom 22
23619812 23619812-1-1 2521 Csolnok Szedres 5
11221492 11221492-1-4 8700 Marcali Kossuth 74
20006802 20006802-0-2 7900 Szigetvar Fertokoz 8
11430384 11430384-0-4 5900 Oroshaza Csorvasi 26
---------------------------------------------------
dropping nonsense AdoszamHosszu values
company_list[stri_length(AdoszamHosszu) < 13, AdoszamHosszu := NA]
there are some probable outliers in terms of CegKora / I have double checked, there could be companies with this age out there
dt <- company_list[Cegkora > mean(Cegkora, na.rm = TRUE) + 3 * sd(Cegkora, na.rm = TRUE), .N, by = Cegkora]
dt[order(-Cegkora)]
there are some FotevekenysegText values are missing / I will fill them up with as NemIsmert & collapse value strings
n <- company_list[is.na(FotevekenysegText), .N]
pander(paste('The number of missing FotevekenysegText values: ', n, sep = ''))
The number of missing FotevekenysegText values: 64885
company_list[is.na(FotevekenysegText), FotevekenysegText := 'NemIsmert']
company_list[grepl(',', FotevekenysegText), FotevekenysegText := gsub(',', '', FotevekenysegText)]
company_list[grepl(' ', FotevekenysegText), FotevekenysegText := gsub(' ', '', FotevekenysegText)]
dt <- data.table(company_list[, .N, by = FotevekenysegText])
let’s check a sample of the data
pander(dt[order(-N, FotevekenysegText)], split.table = 200)
------------------------------------------------------
FotevekenysegText N
------------------------------------------------ -----
KereskedelemGepjarmujavitas 88193
NemIsmert 64885
SzakmaiTudomanyosMuszakiTevekenyseg 58536
Epitoipar 32956
Feldolgozoipar 31117
Ingatlanugyletek 28767
InformacioKommunikacio 20972
AdminisztrativEsSzolgaltatastTamogatoTevekenyseg 20655
Szallashely-SzolgaltatasVendeglatas 18166
Human-EgeszsegugyiSzocialisEllatas 13079
SzallitasRaktarozas 12977
MezogazdasagErdogazdalkodasHalaszat 10592
PenzugyiBiztositasiTevekenyseg 9558
EgyebSzolgaltatas 7478
MuveszetSzorakoztatasSzabadido 7322
Oktatas 7081
VizellatasSzennyviz-Hulladekgazd. 1558
VillamosenergiaGaz-HozellatasLegkondicionalas 897
BanyaszatKofejtes 429
KozigazgatasVedelemKotelezoTarsadalombiztositas 207
HaztartasMunkaadoiTevekenysege 13
TeruletenKivuliSzervezet 1
------------------------------------------------------
there are some not valid Szekhely values / I drop those Szekhely values
n <- company_list[stri_length(Szekhely) < 11
| grepl('\\D', substr(stri_trim(Szekhely), 1, 4))
| grepl('\\^', Szekhely),
.N]
pander(paste('The number of not valid Szekhely values: ', n, sep = ''))
The number of not valid Szekhely values: 12
let’s check a sample of the data
pander(company_list[stri_length(Szekhely) < 11
| grepl('\\D', substr(stri_trim(Szekhely), 1, 4))
| grepl('\\^', Szekhely)
, .(Adoszam, Szekhely, SzekhelyVaros)], split.table = 200)
----------------------------------------------------------
Adoszam Szekhely SzekhelyVaros
--------- ----------------------------- ------------------
10118720 0 Budapest V. Aranykez U. 2. Dapest
10223783 0 Budapest Xii.ker. Diosarok Dapest
Ut 62.
10386543 114 Budapest Villanyi Ut Budapest
11-13.
11541082 0 Pecs Szarvas Dulo 15. �Cs
12291625 0 Sajoszentpeter Jozsef A. U. Joszentpeter
42.
14709070 7630 P E 5 P
14792335 6500 B A 3 B
20009946 0 Pecs Fellbach �Cs
Bevasarlokozpont
22810773 0 Debrecen-Erdospuszta Brecen-Erdospuszta
Vekeri-Piheno
23660258 0 Tatabanya Ii. Sarberki Tabanya
Uzeletsor
29628203 0 Nagykoros Barany U. 5/B. Gykoros
29814730 0 Budapest Vii.ker. Izabella Dapest
U. 36-38.
----------------------------------------------------------
dropping nonsense Szekhely values
company_list[stri_length(Szekhely) < 11
| grepl('\\D', substr(stri_trim(Szekhely), 1, 4))
| grepl('\\^', Szekhely)
, Szekhely := NA]
there are some not valid SzekhelyVaros values / I drop those SzekhelyVaros values
n <- company_list[stri_length(stri_enc_toutf8(SzekhelyVaros, validate = TRUE)) < 2
| grepl('\\d', stri_enc_toutf8(SzekhelyVaros, validate = TRUE)), .N]
pander(paste('The number of not valid SzekhelyVaros values: ', n, sep = ''))
The number of not valid SzekhelyVaros values: 3
let’s check a sample of the data
pander(company_list[stri_length(stri_enc_toutf8(SzekhelyVaros, validate = TRUE)) < 2
| grepl('\\d', stri_enc_toutf8(SzekhelyVaros, validate = TRUE))
, .(Adoszam, Szekhely, stri_enc_toutf8(SzekhelyVaros, validate = TRUE))], split.table = 200)
---------------------------------------------
Adoszam Szekhely V3
--------- ------------------------- ---------
10932807 1047 Budapest6 Foti 56 Budapest6
12769621 2009 0201Pest Bocskai 104 0201Pest
23817797 7960 7960 Dravasztara 25 7960
---------------------------------------------
dropping nonsense SzekhelyVaros values
company_list[stri_length(stri_enc_toutf8(SzekhelyVaros, validate = TRUE)) < 2
| grepl('\\d', stri_enc_toutf8(SzekhelyVaros, validate = TRUE))
, SzekhelyVaros := NA]
saving modified company_list in RData format
file_out = paste(getwd(), '/_tmp/company_list.RData', sep = '')
fwrite(company_list,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(company_list[sample(.N, 5)], split.table = 200)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos
--------- ----------- ---------------- ----------- ----------------------------- ------------------- ------------------------------------------------ --------- --------- -----------------------
21503041 56000 0 0 2146 Mogyorod Szolo 6 Mogyorod EgyebSzolgaltatas NA 14 2
12536036 5521000 2142000 8650000 8900 Zalaegerszeg Jegmadar 16 Zalaegerszeg AdminisztrativEsSzolgaltatastTamogatoTevekenyseg NA 16 1
14221477 -390000 103000 672000 2509 Esztergom-Kertvaros Esztergom-Kertvaros NemIsmert NA 9 2
Wesselenyi 11
24110361 -8234000 635000 36583000 3070 Batonyterenye Batonyterenye Epitoipar 4 4 NA
Szorospataki 31
10696718 14593000 3101000 15512000 4400 Nyiregyhaza Szegfu 48 Nyiregyhaza AdminisztrativEsSzolgaltatastTamogatoTevekenyseg 1 25 2
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
--------------------------------------------------------------
AktivTulajdonosSzam Cegforma AdoszamHosszu BankiUgyfel
--------------------- ---------- --------------- -------------
2 Bt 21503041-1-13 FALSE
1 Kft 12536036-2-20 FALSE
2 Kft 14221477-2-11 FALSE
NA Kft 24110361-2-12 FALSE
2 Kft 10696718-2-15 FALSE
--------------------------------------------------------------
Let’s take a look at histograms of columns (variables) of company list
p1 <- ggplot(company_list) +
geom_histogram(aes(x = SajatToke),
fill = 'blue',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_SajatToke') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_SajatToke') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p2 <- ggplot(company_list) +
geom_histogram(aes(x = MerlegEredmeny),
fill = 'green',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_MerlegEredmeny') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_MerlegEredmeny') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p3 <- ggplot(company_list) +
geom_histogram(aes(x = Arbevetel),
fill = 'orange',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_Arbevetel') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_Arbevetel') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p4 <- ggplot(company_list) +
geom_histogram(aes(x = AktivPrivatTulajdonos),
fill = 'red',
position = 'identity',
binwidth = 0.1,
alpha = I(0.7)) +
ggtitle('Histogram for log10_AktivPrivatTulajdonos') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_AktivPrivatTulajdonos') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p5 <- ggplot(company_list) +
geom_histogram(aes(x = AktivTulajdonosSzam),
fill = 'lightgreen',
position = 'identity',
binwidth = 0.1,
alpha = I(0.7)) +
ggtitle('Histogram for log10_AktivTulajdonosSzam') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_AktivTulajdonosSzam') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p6 <- ggplot(company_list) +
geom_histogram(aes(x = Cegkora),
fill = 'brown',
position = 'identity',
binwidth = 0.2,
alpha = I(0.7)) +
ggtitle('Histogram for log10_Cegkora') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_Cegkora') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p7 <- ggplot(company_list) +
geom_histogram(aes(x = Letszam),
fill = 'orange',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_Letszam') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_Letszam') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p8 <- ggplot(company_list) +
geom_bar(aes(x = as.factor(Cegforma)),
fill = 'blue',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of Cegforma') +
xlab('Cegforma') +
theme_igray()
Sys.sleep(2)
p9 <- ggplot(company_list) +
geom_bar(aes(x = as.factor(FotevekenysegText)),
fill = 'darkgreen',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of FotevekenysegText') +
xlab('FotevekenysegText') +
theme_igray()
Sys.sleep(2)
p10 <- ggplot(company_list) +
geom_bar(aes(x = as.factor(BankiUgyfel)),
fill = 'lightblue',
position = 'identity',
alpha = I(0.7)) +
ggtitle('Breakdown of BankiUgyfel') +
xlab('BankiUgyfel') +
theme_igray()
Sys.sleep(2)
without plotly in one grid:
# multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, cols = 2)
transforming into interactive graph
pp1 <- ggplotly(p1)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp1.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp1), file_out)
transforming into interactive graph
pp2 <- ggplotly(p2)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp2.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp2), file_out)
transforming into interactive graph
pp3 <- ggplotly(p3)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp3.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp3), file_out)
transforming into interactive graph
pp4 <- ggplotly(p4)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp4.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp4), file_out)
transforming into interactive graph
pp5 <- ggplotly(p5)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp5.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp5), file_out)
transforming into interactive graph
pp6 <- ggplotly(p6)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp6.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp6), file_out)
transforming into interactive graph
pp7 <- ggplotly(p7)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp7.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp7), file_out)
transforming into interactive graph
pp8 <- ggplotly(p8)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp8.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp8), file_out)
transforming into interactive graph
pp9 <- ggplotly(p9)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp9.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp9), file_out)
transforming into interactive graph
pp10 <- ggplotly(p10)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp10.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp10), file_out)
plotting
layout(pp1, dragmode = 'pan')
Sys.sleep(2)
layout(pp2, dragmode = 'pan')
Sys.sleep(2)
layout(pp3, dragmode = 'pan')
Sys.sleep(2)
layout(pp4, dragmode = 'pan')
Sys.sleep(2)
layout(pp5, dragmode = 'pan')
Sys.sleep(2)
layout(pp6, dragmode = 'pan')
Sys.sleep(2)
layout(pp7, dragmode = 'pan')
Sys.sleep(2)
layout(pp8, dragmode = 'pan')
Sys.sleep(2)
layout(pp9, dragmode = 'pan')
Sys.sleep(2)
layout(pp10, dragmode = 'pan')
Sys.sleep(2)
deallocating memory!
rm(company_list, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)
downloading & decompressing postcode list of all Hungarian settlements into a corresponding folder
temp <- tempfile()
download.file('http://download.geonames.org/export/zip/HU.zip', temp)
trying URL 'http://download.geonames.org/export/zip/HU.zip'
Content type 'application/zip' length 45266 bytes (44 KB)
==================================================
downloaded 44 KB
unzip(zipfile = temp, exdir = paste(getwd(), '/_tmp/', sep = ''))
file.rename(paste(getwd(), '/_tmp/HU.txt', sep = ''), paste(getwd(), '/_tmp/postcode_list.csv', sep = ''))
[1] TRUE
file.rename(paste(getwd(), '/_tmp/readme.txt', sep = ''), paste(getwd(), '/_tmp/postcode_list_readme.txt', sep = ''))
[1] TRUE
importing into R environment
postcode_list <- data.table(fread(paste(getwd(), '/_tmp/postcode_list.csv', sep = '')))
postcode_list[, ':=' (IrSzam = as.character(V2), Megye = as.character(V4), Varos = as.character(V3), Lat = V10, Lon = V11)]
postcode_list <- unique(postcode_list[, c(13:17)])
changing accented characters into non-accented ones in the downloaded data
postcode_list[, ':=' (Megye = stri_trans_general(Megye, 'Latin-ASCII'), Varos = stri_trans_general(Varos, 'Latin-ASCII'))]
let’s put together those cities which shares the same postcode and average out geocodes on postcode level postcodes shared among at least 2 cities will only have one (averaged) geocode so different settlements will have the same geocodes
postcode_list <- data.table(sqldf('SELECT IrSzam
,GROUP_CONCAT(DISTINCT Megye) AS Megye
,GROUP_CONCAT(DISTINCT Varos) AS Varos
,AVG(Lat) AS Lat
,AVG(Lon) AS Lon
FROM postcode_list
GROUP BY IrSzam'))
Loading required package: tcltk
Let’s check with eyeballing whether we have all of the cities or not (it seems, we have)
dt <- copy(data.table(postcode_list[, .N, by = .(Lat, Lon, Megye)]))
Hungary <- get_map(location = c(16, 45.7, 23, 48.6),
zoom = 8,
source = 'stamen',
maptype = 'toner-lite',
messaging = TRUE,
language = 'hu-HU',
filename = paste(getwd(), '/_tmp/HungaryTemp', sep = ''))
24 tiles required.
trying URL 'http://tile.stamen.com/toner-lite/8/139/88.png'
Content type 'image/png' length 19154 bytes (18 KB)
==================================================
downloaded 18 KB
Map from URL : http://tile.stamen.com/toner-lite/8/139/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/140/88.png'
Content type 'image/png' length 22297 bytes (21 KB)
==================================================
downloaded 21 KB
Map from URL : http://tile.stamen.com/toner-lite/8/140/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/141/88.png'
Content type 'image/png' length 15163 bytes (14 KB)
==================================================
downloaded 14 KB
Map from URL : http://tile.stamen.com/toner-lite/8/141/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/142/88.png'
Content type 'image/png' length 13851 bytes (13 KB)
==================================================
downloaded 13 KB
Map from URL : http://tile.stamen.com/toner-lite/8/142/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/143/88.png'
Content type 'image/png' length 16986 bytes (16 KB)
==================================================
downloaded 16 KB
Map from URL : http://tile.stamen.com/toner-lite/8/143/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/144/88.png'
Content type 'image/png' length 10574 bytes (10 KB)
==================================================
downloaded 10 KB
Map from URL : http://tile.stamen.com/toner-lite/8/144/88.png
trying URL 'http://tile.stamen.com/toner-lite/8/139/89.png'
Content type 'image/png' length 14527 bytes (14 KB)
==================================================
downloaded 14 KB
Map from URL : http://tile.stamen.com/toner-lite/8/139/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/140/89.png'
Content type 'image/png' length 17520 bytes (17 KB)
==================================================
downloaded 17 KB
Map from URL : http://tile.stamen.com/toner-lite/8/140/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/141/89.png'
Content type 'image/png' length 23927 bytes (23 KB)
==================================================
downloaded 23 KB
Map from URL : http://tile.stamen.com/toner-lite/8/141/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/142/89.png'
Content type 'image/png' length 18922 bytes (18 KB)
==================================================
downloaded 18 KB
Map from URL : http://tile.stamen.com/toner-lite/8/142/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/143/89.png'
Content type 'image/png' length 18461 bytes (18 KB)
==================================================
downloaded 18 KB
Map from URL : http://tile.stamen.com/toner-lite/8/143/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/144/89.png'
Content type 'image/png' length 12786 bytes (12 KB)
==================================================
downloaded 12 KB
Map from URL : http://tile.stamen.com/toner-lite/8/144/89.png
trying URL 'http://tile.stamen.com/toner-lite/8/139/90.png'
Content type 'image/png' length 13670 bytes (13 KB)
==================================================
downloaded 13 KB
Map from URL : http://tile.stamen.com/toner-lite/8/139/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/140/90.png'
Content type 'image/png' length 15049 bytes (14 KB)
==================================================
downloaded 14 KB
Map from URL : http://tile.stamen.com/toner-lite/8/140/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/141/90.png'
Content type 'image/png' length 15409 bytes (15 KB)
==================================================
downloaded 15 KB
Map from URL : http://tile.stamen.com/toner-lite/8/141/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/142/90.png'
Content type 'image/png' length 20012 bytes (19 KB)
==================================================
downloaded 19 KB
Map from URL : http://tile.stamen.com/toner-lite/8/142/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/143/90.png'
Content type 'image/png' length 12477 bytes (12 KB)
==================================================
downloaded 12 KB
Map from URL : http://tile.stamen.com/toner-lite/8/143/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/144/90.png'
Content type 'image/png' length 10303 bytes (10 KB)
==================================================
downloaded 10 KB
Map from URL : http://tile.stamen.com/toner-lite/8/144/90.png
trying URL 'http://tile.stamen.com/toner-lite/8/139/91.png'
Content type 'image/png' length 15069 bytes (14 KB)
==================================================
downloaded 14 KB
Map from URL : http://tile.stamen.com/toner-lite/8/139/91.png
trying URL 'http://tile.stamen.com/toner-lite/8/140/91.png'
Content type 'image/png' length 11172 bytes (10 KB)
==================================================
downloaded 10 KB
Map from URL : http://tile.stamen.com/toner-lite/8/140/91.png
trying URL 'http://tile.stamen.com/toner-lite/8/141/91.png'
Content type 'image/png' length 16781 bytes (16 KB)
==================================================
downloaded 16 KB
Map from URL : http://tile.stamen.com/toner-lite/8/141/91.png
trying URL 'http://tile.stamen.com/toner-lite/8/142/91.png'
Content type 'image/png' length 14413 bytes (14 KB)
==================================================
downloaded 14 KB
Map from URL : http://tile.stamen.com/toner-lite/8/142/91.png
trying URL 'http://tile.stamen.com/toner-lite/8/143/91.png'
Content type 'image/png' length 12728 bytes (12 KB)
==================================================
downloaded 12 KB
Map from URL : http://tile.stamen.com/toner-lite/8/143/91.png
trying URL 'http://tile.stamen.com/toner-lite/8/144/91.png'
Content type 'image/png' length 14286 bytes (13 KB)
==================================================
downloaded 13 KB
Map from URL : http://tile.stamen.com/toner-lite/8/144/91.png
saving
file_out = paste(getwd(), '/_tmp/Hungary.rda', sep = '')
save(Hungary, file = file_out)
ggmap(Hungary) +
geom_point(data = dt,
aes(x = Lon,
y = Lat,
color = Megye,
size = N),
alpha = 0.7,
stroke = 1.2,
shape = 1) +
ggtitle('Number of Postcodes per Cities on Hungary')
let’s check a sample of the data
pander(postcode_list[sample(.N, 5)], split.table = 200)
---------------------------------------------
IrSzam Megye Varos Lat Lon
-------- ----------- ------------ ----- -----
3324 Heves Felsotarkany 47.97 20.42
8821 Zala Nagybakonak 46.55 17.05
6717 Csongrad Szeged 46.32 20.03
6334 Bacs-Kiskun Gederlak 46.62 18.92
8081 Fejer Zamoly 47.32 18.42
---------------------------------------------
deallocating memory!
rm(temp)
downloading those companies list which employed people without registration from http://nav.gov.hu/nav/adatbazisok/benemjelentett this link name (and the content also) is changing weekly, so I used Selenium
Page <- 'http://nav.gov.hu/nav/adatbazisok/benemjelentett'
XpathValue <- '//*[@id="portal"]/div[6]/div[2]/div[2]/div[2]/div/div/div/table/tbody/tr[2]/td[2]/a'
rD <- rsDriver(port = 4567L,
browser = 'chrome',
version = 'latest',
chromever = 'latest',
geckover = 'latest',
phantomver = '2.1.1',
verbose = TRUE,
check = TRUE)
checking Selenium Server versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
checking chromedriver versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
checking geckodriver versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
checking phantomjs versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
[1] "Connecting to remote server"
$applicationCacheEnabled
[1] FALSE
$rotatable
[1] FALSE
$mobileEmulationEnabled
[1] FALSE
$networkConnectionEnabled
[1] TRUE
$chrome
$chrome$chromedriverVersion
[1] "2.29.461571 (8a88bbe0775e2a23afda0ceaf2ef7ee74e822cc5)"
$chrome$userDataDir
[1] "/tmp/.org.chromium.Chromium.iH4ir2"
$takesHeapSnapshot
[1] TRUE
$pageLoadStrategy
[1] "normal"
$unhandledPromptBehavior
[1] ""
$databaseEnabled
[1] FALSE
$handlesAlerts
[1] TRUE
$hasTouchScreen
[1] TRUE
$version
[1] "58.0.3029.110"
$platform
[1] "LINUX"
$browserConnectionEnabled
[1] FALSE
$nativeEvents
[1] TRUE
$acceptSslCerts
[1] TRUE
$webdriver.remote.sessionid
[1] "e8aed675-febf-40c9-8eff-4f740a17291f"
$locationContextEnabled
[1] TRUE
$webStorageEnabled
[1] TRUE
$browserName
[1] "chrome"
$takesScreenshot
[1] TRUE
$javascriptEnabled
[1] TRUE
$cssSelectorsEnabled
[1] TRUE
$unexpectedAlertBehaviour
[1] ""
$id
[1] "e8aed675-febf-40c9-8eff-4f740a17291f"
remDr <- rD[['client']]
remDr$navigate(Page)
WebElem <- remDr$findElement(using = 'xpath', value = XpathValue)
URL <- as.character(WebElem$getElementAttribute('href'))
remDr$close()
rD[['server']]$stop()
[1] TRUE
rm(rD)
rm(remDr)
rm(WebElem)
unreg_emp <- data.table(read.xlsx(URL))
changing accented characters into non-accented ones in the downloaded data
colnames(unreg_emp) <- gsub('(', '_', gsub(')', '', gsub(' ', '', stri_trans_totitle(gsub('\\.', ' ', stri_trans_general(colnames(unreg_emp), 'Latin-ASCII'))))), fixed = TRUE)
unreg_emp[, ':=' (AdozoNeve = stri_trans_general(AdozoNeve, 'Latin-ASCII'), Szekhely_Lakcim = stri_trans_general(Szekhely_Lakcim, 'Latin-ASCII'))]
saving
file_out = paste(getwd(), '/_tmp/unreg_emp.RData', sep = '')
fwrite(unreg_emp,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(unreg_emp[sample(.N, 5)], split.table = 200)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sorszam AdozoNeve Szekhely_Lakcim Adoszam_AdoazonositoJel JogsertestMegallapitoHatarozatKelte JogsertestMegallapitoHatarozatVegrehajthatovaValasanakNapja
--------- ------------------------------ ------------------------------ ------------------------- ------------------------------------- -------------------------------------------------------------
262 Balint Gastro Kft. 4624 Tiszabezded, Dozsa Gy. u. 25088681-2-15 42328 42378
39.
1027 Equinvest Kft. 7100 Szekszard, Rakoczi u. 23093263-2-08 42199 42244
134.
1805 JERI Kereskedelmi Bt. ,,kt.a." 1035 Budapest, Miklos utca 13. 21312818-2-41 42634 42670
VIII. em. 42.
3205 RESTO FOOD Kft. 3281 Karacsond, Nagy Imre ut 14818013-2-10 41886 41909
2.
1232 Folcz Kereskedelmi es 2900 Komarom, Diofa ut 16. 20326973-2-11 42347 42355
Szolgaltato Bt.
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
-----------------
KozzetetelNapja
-----------------
42422
42268
42751
42144
42422
-----------------
extract the necessary data out of unreg_emp
unreg_emp <- unreg_emp[, names(unreg_emp)[4], with = FALSE]
unreg_emp[, unreg_emp := TRUE]
unreg_emp[, Adoszam := substr(stri_trim(Adoszam_AdoazonositoJel), 1, 8)]
unreg_emp[, Adoszam_AdoazonositoJel := NULL]
unreg_emp <- unique(unreg_emp)
saving
file_out = paste(getwd(), '/_tmp/unreg_emp.RData', sep = '')
fwrite(unreg_emp,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(unreg_emp[sample(.N, 5)], split.table = 200)
---------------------
unreg_emp Adoszam
----------- ---------
TRUE 13163602
TRUE 66485494
TRUE 24722412
TRUE 24906120
TRUE 13354309
---------------------
creating a new, modified instance of company list - importing
file_in = paste(getwd(), '/_tmp/company_list.RData', sep = '')
company_list_1 <- data.table(fread(file_in))
creating a new, modified instance of company list - creating new columns
company_list_1[nchar(sub(' .*', '', stri_trim(Szekhely))) == 4, IrSzam := substr(sub(' .*', '', stri_trim(Szekhely)), 1, 4)]
company_list_1[Adoszam == substr(AdoszamHosszu,1 , 8) & nchar(AdoszamHosszu) == 13, AFAKod := substr(AdoszamHosszu, 10, 10)]
company_list_1[Adoszam == substr(AdoszamHosszu,1 , 8) & nchar(AdoszamHosszu) == 13, NAVTeruletKod := substr(AdoszamHosszu, 12, 13)]
company_list_1[, AdoszamHosszu := NULL]
let’s check a sample of the data
pander(company_list_1[sample(.N, 5)], split.table = 200)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos
--------- ----------- ---------------- ----------- ------------------------------ --------------- ------------------------------------------------ --------- --------- -----------------------
24319687 0 0 0 1037 Budapest Bokor 9 Budapest SzakmaiTudomanyosMuszakiTevekenyseg 1 3 1
24128595 582000 -49000 0 1055 Budapest Szentistvan 1 Budapest InformacioKommunikacio NA 4 2
23451751 3556000 496000 19775000 2051 Biatorbagy Szily 2 Biatorbagy KereskedelemGepjarmujavitas 2 5 1
12942503 3802000 614000 3312000 7720 Lovaszheteny Szabadsag 17 Lovaszheteny NemIsmert 1 14 2
23156470 -1519000 108000 8490000 7624 Pecs Rakoczi 24 Pecs AdminisztrativEsSzolgaltatastTamogatoTevekenyseg 2 6 1
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
--------------------------------------------------------------------------------
AktivTulajdonosSzam Cegforma BankiUgyfel IrSzam AFAKod NAVTeruletKod
--------------------- ---------- ------------- -------- -------- ---------------
1 Kft FALSE 1037 2 41
2 Kft FALSE 1055 2 41
1 Kft FALSE 2051 2 13
2 Kft FALSE 7720 1 02
1 Kft FALSE 7624 2 02
--------------------------------------------------------------------------------
let’s denote those companies which used unregistered employees - right join
company_list_1[, Adoszam := as.character(Adoszam)]
setkey(unreg_emp, Adoszam)
setkey(company_list_1, Adoszam)
company_list_1 <- unreg_emp[company_list_1]
replacing NA with FALSE in unreg_emp column
company_list_1[is.na(unreg_emp), unreg_emp := FALSE]
let’s check a sample of the data
pander(company_list_1[unreg_emp, ][sample(.N, 5)], split.table = 200)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
unreg_emp Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos
----------- --------- ----------- ---------------- ----------- ------------------------ --------------- ------------------------------------------------ --------- --------- -----------------------
TRUE 24090524 0 0 0 1191 Budapest Hamvas 7 Budapest AdminisztrativEsSzolgaltatastTamogatoTevekenyseg NA 4 2
TRUE 25030011 3430000 206000 11017000 1039 Budapest Zsirai 2 Budapest SzallitasRaktarozas 3 2 1
TRUE 12649372 -19187000 -2706000 9524000 3521 Miskolc Szacsvai 12 Miskolc KereskedelemGepjarmujavitas NA 15 1
TRUE 13636445 608000 -3099000 58956000 7624 Pecs Xaver 19 Pecs Szallashely-SzolgaltatasVendeglatas NA 11 1
NA 13628596 NA NA NA NA NA NA NA NA NA
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
----------------------------------------------------------------------------------------------
AktivTulajdonosSzam Cegforma BankiUgyfel IrSzam AFAKod NAVTeruletKod i.unreg_emp
--------------------- ---------- ------------- -------- -------- --------------- -------------
2 Kft FALSE 1191 2 43 TRUE
1 Kft FALSE 1039 2 41 TRUE
1 Kft FALSE 3521 2 05 TRUE
1 Kft FALSE 7624 2 02 TRUE
NA NA NA NA NA NA TRUE
----------------------------------------------------------------------------------------------
joining latitude and longitude data of postcodes to company list - right join
setkey(postcode_list, IrSzam)
setkey(company_list_1, IrSzam)
company_list_1 <- postcode_list[company_list_1]
i <- company_list_1[is.na(Lat), .N]
j <- company_list_1[, .N]
pander(paste('There are', i, 'clients out of', j, 'without geocode proxy', sep = ' '), split.table = 200)
There are 781 clients out of 435439 without geocode proxy
Let’s check with eyeballing in which postcode area are the current clients’ hq
dt <- copy(data.table(company_list_1[!is.na(Lat), .N, by = .(BankiUgyfel, Lat, Lon)]))
ggmap(Hungary) +
geom_point(data = dt,
aes(x = Lon,
y = Lat,
color = BankiUgyfel,
size = N),
alpha = 0.7,
stroke = 1.2,
shape = 1) +
ggtitle('Number of Companies per Postcode Areas on Hungary')
let’s check a sample of the data
pander(company_list_1[!is.na(Lat), ][sample(.N, 5)], split.table = 200)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
IrSzam Megye Varos Lat Lon unreg_emp Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros FotevekenysegText Letszam
-------- ----------- ------------- ----- ----- ----------- --------- ----------- ---------------- ----------- --------------------------- --------------- ----------------------------------- ---------
2340 Pest Kiskunlachaza 47.2 19.02 FALSE 23867747 -1373000 321000 971000 2340 Kiskunlachaza Dozsa 77 Kiskunlachaza Feldolgozoipar NA
2030 Pest Erd 47.37 18.93 FALSE 10902941 -2279000 5862000 5426000 2030 Erd Terasz 56 Erd Epitoipar 1
4033 Hajdu-Bihar Debrecen 47.77 21.24 FALSE 14513017 -2495000 238000 4057000 4033 Debrecen Gyorgy 9 Debrecen SzakmaiTudomanyosMuszakiTevekenyseg NA
2030 Pest Erd 47.37 18.93 FALSE 22701015 8300000 -490000 0 2030 Erd Erzsebet 46 Erd Oktatas NA
1193 Budapest Budapest 47.5 19.08 FALSE 22725134 -1164000 -467000 4318000 1193 Budapest Klapka 2 Budapest SzallitasRaktarozas NA
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
---------------------------------------------------------------------------------------------------------
Cegkora AktivPrivatTulajdonos AktivTulajdonosSzam Cegforma BankiUgyfel AFAKod NAVTeruletKod
--------- ----------------------- --------------------- ---------- ------------- -------- ---------------
5 2 2 Kft TRUE 2 13
23 2 2 Kft FALSE 2 13
8 2 2 Kft FALSE 1 09
6 2 2 Bt FALSE 1 13
6 1 1 Kft FALSE 2 43
---------------------------------------------------------------------------------------------------------
joining latitude and longitude data of postcodes to branch postcode list - right join
setkey(branch_zip_list, IrSzam)
setkey(postcode_list, IrSzam)
branch_postcode_list <- postcode_list[branch_zip_list]
saving
file_out = paste(getwd(), '/_tmp/branch_postcode_list.RData', sep = '')
fwrite(branch_postcode_list,
file = file_out,
nThread = getDTthreads())
Let’s check with eyeballing in which postcode area are the branches on the bank
ggmap(Hungary) +
geom_point(data = branch_postcode_list,
aes(x = Lon,
y = Lat),
size = 1,
color = 'green',
alpha = 0.7,
stroke = 1.2,
shape = 1) +
ggtitle('Bank Branches All Over Hungary')
let’s check a sample of the data
pander(branch_postcode_list[sample(.N, 5)], split.table = 200)
--------------------------------------------
IrSzam Megye Varos Lat Lon
-------- ----------- ----------- ----- -----
1211 Budapest Budapest 47.5 19.08
6230 Bacs-Kiskun Soltvadkert 46.58 19.4
6413 Bacs-Kiskun Kunfeherto 46.35 19.42
6088 Bacs-Kiskun Apostag 46.88 18.95
6331 Bacs-Kiskun Fokto 46.52 18.92
--------------------------------------------
deallocating memory!
rm(Hungary)
calculating distance proxy between firms address and a the nearest branch of bank
postcode_geocode_list <- copy(data.table(unique(company_list_1[!is.na(Lat), .(IrSzam, Lat, Lon)])))
setkey(postcode_geocode_list, IrSzam)
dist_matrix <- copy(data.table(postcode_geocode_list[, IrSzam]))
colnames(dist_matrix) <- 'IrSzam'
dist_matrix[, LegkozFiokTavProxy := 10000000]
for (i in 1:nrow(postcode_geocode_list)) {
for (j in 1:nrow(branch_postcode_list)) {
dist_matrix[i, branch_postcode_list[j, IrSzam]] <- gdist(lon.1 = postcode_geocode_list[i, Lon],
lat.1 = postcode_geocode_list[i, Lat],
lon.2 = branch_postcode_list[j, Lon],
lat.2 = branch_postcode_list[j, Lat],
units = 'km')
}
dist_matrix[i, LegkozFiokTavProxy := min(as.numeric(dist_matrix[i, 3:(ncol(dist_matrix)), with = FALSE]), na.rm = TRUE)]
}
saving
file_out = paste(getwd(), '/_tmp/dist_matrix.RData', sep = '')
fwrite(dist_matrix,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the distance matrix data
pander(dist_matrix[sample(.N, 5)], split.table = 200)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
IrSzam LegkozFiokTavProxy 1211 2300 2315 2336 2340 2344 6000 6031 6032 6033 6041 6044 6045 6050 6060 6065 6066 6077 6078 6087 6088 6097 6100 6112
-------- -------------------- ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------
3378 96.52 112.5 135.9 124.4 122 129.6 137.4 110.2 103.1 110.6 118.9 120.5 119.9 114.2 110.3 96.52 105.5 111.3 137.2 130.5 149.4 153.7 127.9 126.2 132.6
4063 121.3 171.1 188 179.9 176.1 182.1 187.2 142.9 132.5 139.8 149.3 160.5 162.3 157.6 151.6 121.3 131.9 136.1 171.9 162.8 195.3 199.1 170.7 151.6 153.3
2522 33.23 33.23 58.31 42.77 50.13 56.61 68.3 117.4 124.2 125.1 123.5 99.46 92.59 89.6 95.85 132.6 130.4 134.3 117.1 121.3 85.79 88.87 91.69 136.5 148.3
8948 169.3 202.1 179.5 189 191.8 185.4 180.9 236.9 248.5 243.5 235.2 214.1 210.1 213.4 220.7 261.3 252.4 251.8 212.6 222.8 176.4 173.5 202.1 241.6 248.8
9985 210.7 231.1 213.5 220.4 224.3 219 216.3 275.1 286.5 282.1 274.2 251.9 247.3 250 257.5 299.2 290.8 290.7 252.3 262.4 214.2 211.7 239.6 281.5 289.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
6113 6114 6115 6120 6131 6133 6134 6135 6211 6230 6237 6300 6320 6323 6325 6326 6328 6331 6332 6333 6334 6336 6345 6346 6347 6353 6400 6413 6421 6422 6423 6449 6500
------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------
136.7 134.5 124.4 152.4 148.1 143.9 151.4 157 147.9 156.2 167.3 179 156 157.3 154.5 163.5 169.4 184 179.8 178 175.7 173.1 193.3 200.4 204.2 193 166.7 177.3 181.6 186.3 186.7 192 211.1
160 163.4 153.7 175 173.7 167.8 170.4 175.7 181.7 188 200.3 216.1 198.8 201.5 194.8 203.6 208.6 221.6 218.5 217.6 215.5 209.4 225.4 232.3 235.7 227.3 193.4 203.6 205.1 207.7 206.3 216.3 241.6
144.2 130.9 127 152.1 143.4 145.6 159.5 163.1 122.7 131.5 133.2 127.6 99.02 94.32 106.3 110.8 116.2 128.7 123.2 119.4 117.7 126.8 150.2 155.3 158.8 143.9 149.3 155.8 165.3 172.9 177.4 169.7 166
240.5 228.4 233.7 234.6 228.7 235.4 244.6 243.6 207.1 207.4 197 176.6 177.8 173 185.3 178.8 176.8 171.5 170.9 169.3 170.4 183.1 185.6 183 182.8 177.9 216.3 212.5 220.2 225.3 231.2 213.9 183.9
281.2 268.6 273.2 276.3 269.9 276.4 286.3 285.6 247.6 248.6 238.7 218.5 217 211.8 225.1 219.2 217.8 213.7 212.6 210.7 211.7 224.8 228.8 226.6 226.5 220.9 258.5 255.3 263.3 268.6 274.6 257.4 227.9
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
let’s enriching the company list data with nearest branch distance proxy
dt <- copy(data.table(dist_matrix[, 1:2]))
setkey(dt, IrSzam)
setkey(company_list_1, IrSzam)
company_list_1 <- dt[company_list_1]
saving
file_out = paste(getwd(), '/_tmp/company_list_1.RData', sep = '')
fwrite(company_list_1,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(company_list_1[sample(.N, 5)], split.table = 200)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
IrSzam LegkozFiokTavProxy Megye Varos Lat Lon unreg_emp Adoszam SajatToke MerlegEredmeny Arbevetel Szekhely SzekhelyVaros
-------- -------------------- -------------------- ----------------- ----- ----- ----------- --------- ----------- ---------------- ----------- --------------------------- -----------------
2162 24.61 Pest Orbottyan 47.68 19.27 FALSE 14614352 8013000 2971000 27760000 2162 Orbottyan Dozsa 9547 Orbottyan
1088 1.217 Budapest Budapest 47.49 19.09 FALSE 26851284 0 0 0 1088 Budapest Muzeum 15 Budapest
2310 4.21 Pest Szigetszentmiklos 47.35 19.05 FALSE 14142424 39000 -8000 0 2310 Szigetszentmiklos Szigetszentmiklos
Sargarozsa 12
5400 41.26 Jasz-Nagykun-Szolnok Mezotur 47 20.63 FALSE 11508052 33882000 -241000 9157000 5400 Mezotur Kossuth 6 Mezotur
1149 3.623 Budapest Budapest 47.51 19.13 FALSE 13121927 -6565000 -2500000 15129000 1149 Budapest Nagylajos 134 Budapest
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
------------------------------------------------------------------------------------------------------------------------------------------------------
FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos AktivTulajdonosSzam Cegforma BankiUgyfel AFAKod NAVTeruletKod
---------------------------------- --------- --------- ----------------------- --------------------- ---------- ------------- -------- ---------------
NemIsmert 3 8 1 1 Kft FALSE 2 13
InformacioKommunikacio NA 23 2 2 Bt FALSE 3 42
Oktatas NA 9 2 2 Kft FALSE 1 13
Human-EgeszsegugyiSzocialisEllatas NA 18 1 1 Kft FALSE 2 16
NemIsmert 4 13 NA NA Kft FALSE 2 42
------------------------------------------------------------------------------------------------------------------------------------------------------
deallocating memory
rm(dist_matrix)
let’ s denote whether parent companies are clients also or not
dt <- company_list_1[BankiUgyfel == TRUE & nchar(Adoszam) == 8, .(BankiUgyfel, Adoszam)]
setnames(dt, 'Adoszam', 'AdoszamTulaj')
setnames(dt, 'BankiUgyfel', 'TulajCegesBankiUgyfel')
owner_company_list_1 <- copy(data.table(owner_company_list))
right join
setkey(dt, AdoszamTulaj)
setkey(owner_company_list_1, AdoszamTulaj)
owner_company_list_1 <- dt[owner_company_list_1]
owner_company_list_1[is.na(TulajCegesBankiUgyfel), TulajCegesBankiUgyfel := FALSE]
saving
file_out = paste(getwd(), '/_tmp/owner_company_list_1.RData', sep = '')
fwrite(owner_company_list_1,
file = file_out,
nThread = getDTthreads())
let’s check the data
pander(owner_company_list_1[, .N, by = (TulajCegesBankiUgyfel)], split.table = 200)
-----------------------------
TulajCegesBankiUgyfel N
----------------------- -----
FALSE 32651
TRUE 528
-----------------------------
let’s count the number of those owner companies which are customers of the Bank by affiliate companies
dt <- copy(data.table(owner_company_list_1[TulajCegesBankiUgyfel == TRUE, ]))
dt <- data.table(sqldf('SELECT Adoszam
,COUNT(DISTINCT TulajCegesBankiUgyfel) AS TulajCegesBankiUgyfelNr
FROM dt
GROUP BY Adoszam'))
let’s check a sample of the data
pander(dt[sample(.N, 5)], split.table = 200)
-----------------------------------
Adoszam TulajCegesBankiUgyfelNr
--------- -------------------------
12634402 1
23113165 1
25414905 1
13641346 1
12459418 1
-----------------------------------
let’s join the company list with the modified owner company list
setkey(dt, Adoszam)
setkey(company_list_1, Adoszam)
company_list_1 <- dt[company_list_1]
company_list_1[is.na(TulajCegesBankiUgyfelNr), TulajCegesBankiUgyfelNr := 0]
saving
file_out = paste(getwd(), '/_tmp/company_list_1.RData', sep = '')
fwrite(company_list_1,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(company_list_1[sample(.N, 5)], split.table = 200)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Adoszam TulajCegesBankiUgyfelNr IrSzam LegkozFiokTavProxy Megye Varos Lat Lon unreg_emp SajatToke MerlegEredmeny Arbevetel Szekhely
--------- ------------------------- -------- -------------------- ----------------- ----------- ----- ----- ----------- ----------- ---------------- ----------- --------------------------
22674784 0 9022 96.76 Gyor-Moson-Sopron Gyor 47.61 17.78 FALSE 3353000 575000 39936000 9022 Gyor Bajcsy 43
14249734 0 1151 0 Budapest Budapest 47.5 19.08 FALSE 30249000 23601000 293416000 1151 Budapest Szekely 11
13981604 0 2039 17.97 Pest Pusztazamor 47.4 18.78 FALSE 0 0 0 2039 Pusztazamor Erkel 2
24695181 0 4262 160.6 Hajdu-Bihar Nyiracsad 47.6 21.98 FALSE 1000 -194000 653000 4262 Nyiracsad Kossuth 114
28550219 0 1031 8.404 Budapest Budapest 47.57 19.05 FALSE -3740000 -2000 0 1031 Budapest Varsa 3
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
SzekhelyVaros FotevekenysegText Letszam Cegkora AktivPrivatTulajdonos AktivTulajdonosSzam Cegforma BankiUgyfel AFAKod NAVTeruletKod
--------------- ----------------------------------- --------- --------- ----------------------- --------------------- ---------- ------------- -------- ---------------
Gyor SzakmaiTudomanyosMuszakiTevekenyseg 4 6 1 1 Kft FALSE 2 08
Budapest KereskedelemGepjarmujavitas 7 8 2 2 Kft FALSE 2 42
Pusztazamor Ingatlanugyletek NA 9 NA NA FALSE 2 13
Nyiracsad KereskedelemGepjarmujavitas NA 3 1 1 Kft FALSE 2 09
Budapest NemIsmert NA 22 2 2 Bt FALSE 1 41
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
deallocating memory
rm(owner_company_list, owner_company_list_1)
prerequisite of detection of Adoszam values being unique is to order them by that column
company_list_1 <- company_list_1[order(Adoszam)]
let’s check column values validity again
ct <- check_that(company_list_1,
valid_notation_BankiUgyfel = BankiUgyfel == 0
| BankiUgyfel == 1,
distinct_Adoszam = rle(sort(Adoszam))[[1]] == 1,
complete_Adoszam = stri_length(Adoszam) == 8,
initcap_Cegforma = Cegforma == stri_trans_totitle(Cegforma),
valid_AktivTulajdonosSzam = AktivTulajdonosSzam >= 1,
valid_AktivPrivatTulajdonos = AktivTulajdonosSzam >= AktivPrivatTulajdonos,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1,
not_negative_Cegkora = Cegkora >= 0,
under_UCL_Cegkora = Cegkora <= mean(Cegkora, na.rm = TRUE) + 3 * sd(Cegkora, na.rm = TRUE),
valid_Letszam = Letszam >= 1,
filled_FotevekenysegText = stri_length(FotevekenysegText) >= 1,
valid_postcode = nchar(sub(' .*', '', stri_trim(Szekhely))) == 4,
valid_SzekhelyVaros = stri_length(SzekhelyVaros) >= 2
& grepl('\\d', SzekhelyVaros) == FALSE,
initcap_SzekhelyVaros = SzekhelyVaros == stri_trans_totitle(SzekhelyVaros),
valid_Szekhely = stri_length(Szekhely) >= 11
& grepl('\\D', substr(stri_trim(Szekhely), 1, 4)) == FALSE
& grepl('\\^', Szekhely) == FALSE,
initcap_Szekhely = Szekhely == stri_trans_totitle(Szekhely),
filled_Arbevetel = stri_length(Arbevetel) >= 1,
filled_MerlegEredmeny = stri_length(MerlegEredmeny) >= 1,
filled_SajatToke = stri_length(SajatToke) >= 1,
filled_FotevekenysegText_and_AktivTulajdonosSzam_and_Cegforma = stri_length(FotevekenysegText) >= 1
& stri_length(AktivTulajdonosSzam) >= 1
& stri_length(Cegforma) >= 1,
filled_Lat_Lon = !is.na(Lat)
& !is.na(Lon),
filled_unreg_emp = !is.na(unreg_emp),
filled_AFAKod = !is.na(AFAKod),
filled_Megye = !is.na(Megye),
filled_TulajCegesBankiUgyfelNr = !is.na(TulajCegesBankiUgyfelNr)
)
dt <- data.table(summary(ct))
dt <- dt[, c(1:5, 8)]
let’s see the validity of the data
pander(dt, split.table = 200)
--------------------------------------------------------------------------------------------------------------------------------
rule items passes fails nNA expression
------------------------------------------------------------- ------- -------- ------- ------ ----------------------------------
valid_notation_BankiUgyfel 435439 435439 0 0 BankiUgyfel == 0 | BankiUgyfel
== 1
distinct_Adoszam 435439 435439 0 0 rle(sort(Adoszam))[[1]] == 1
complete_Adoszam 435439 435439 0 0 stri_length(Adoszam) == 8
initcap_Cegforma 435439 435439 0 0 Cegforma ==
stri_trans_totitle(Cegforma)
valid_AktivTulajdonosSzam 435439 419130 0 16309 AktivTulajdonosSzam >= 1
valid_AktivPrivatTulajdonos 435439 419130 0 16309 AktivTulajdonosSzam >=
AktivPrivatTulajdonos
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos 435439 419130 0 16309 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1
not_negative_Cegkora 435439 435055 0 384 Cegkora >= 0
under_UCL_Cegkora 435439 434151 904 384 Cegkora <= mean(Cegkora, na.rm
= TRUE) + 3 * sd(Cegkora,
na.rm = TRUE)
valid_Letszam 435439 122585 0 312854 Letszam >= 1
filled_FotevekenysegText 435439 435439 0 0 stri_length(FotevekenysegText)
>= 1
valid_postcode 435439 435426 13 0 nchar(sub(" .*", "",
stri_trim(Szekhely))) == 4
valid_SzekhelyVaros 435439 435430 9 0 stri_length(SzekhelyVaros) >=
2 & grepl("\\d",
SzekhelyVaros) == FALSE
initcap_SzekhelyVaros 435439 435439 0 0 SzekhelyVaros ==
stri_trans_totitle(SzekhelyVaros)
valid_Szekhely 435439 435422 17 0 stri_length(Szekhely) >= 11 &
grepl("\\D",
substr(stri_trim(Szekhely), 1,
4)) == FALSE & grepl("\\^",
Szekhely) == FALSE
initcap_Szekhely 435439 435439 0 0 Szekhely ==
stri_trans_totitle(Szekhely)
filled_Arbevetel 435439 435439 0 0 stri_length(Arbevetel) >= 1
filled_MerlegEredmeny 435439 435439 0 0 stri_length(MerlegEredmeny) >=
1
filled_SajatToke 435439 435439 0 0 stri_length(SajatToke) >= 1
filled_FotevekenysegText_and_AktivTulajdonosSzam_and_Cegforma 435439 387236 35205 12998 stri_length(FotevekenysegText)
>= 1 &
stri_length(AktivTulajdonosSzam)
>= 1 & stri_length(Cegforma)
>= 1
filled_Lat_Lon 435439 434658 781 0 !is.na(Lat) & !is.na(Lon)
filled_unreg_emp 435439 435439 0 0 !is.na(unreg_emp)
filled_AFAKod 435439 434828 611 0 !is.na(AFAKod)
filled_Megye 435439 434658 781 0 !is.na(Megye)
filled_TulajCegesBankiUgyfelNr 435439 435439 0 0 !is.na(TulajCegesBankiUgyfelNr)
--------------------------------------------------------------------------------------------------------------------------------
Some AktivPrivatTulajdonos, AktivTulajdonosSzam values are missing / I impute them with the median value in each Megye with same FotevekenysegText and AFAKod
subsetting into dt
dt <- copy(data.table(company_list_1[!is.na(AFAKod)
& !is.na(Megye)
& !is.na(FotevekenysegText),
.(Adoszam,
AktivPrivatTulajdonos,
AktivTulajdonosSzam,
AFAKod,
Megye,
FotevekenysegText)]))
factorization of categorical variables
dt[, ':=' (AFAKod = as.factor(AFAKod),
Megye = as.factor(Megye),
FotevekenysegText = as.factor(FotevekenysegText))]
checking the number of missing values of AktivPrivatTulajdonos
n <- dt[is.na(AktivPrivatTulajdonos), .N]
pandoc.header(paste('The prior number of missing AktivPrivatTulajdonos values: ', n, sep = ''))
# The prior number of missing AktivPrivatTulajdonos values: 15942
imputation
formula <- AktivPrivatTulajdonos ~ AFAKod + Megye + FotevekenysegText
dt <- impute_median(dt, formula)
Sys.sleep(3)
rounding the imputed values
dt[!is.na(AktivPrivatTulajdonos), AktivPrivatTulajdonos := round(AktivPrivatTulajdonos, 0)]
checking the number of missing values of AktivTulajdonosSzam
n <- dt[is.na(AktivTulajdonosSzam), .N]
pandoc.header(paste('The prior number of missing AktivTulajdonosSzam values: ', n, sep = ''))
# The prior number of missing AktivTulajdonosSzam values: 15942
imputation
formula <- AktivTulajdonosSzam ~ AFAKod + Megye + FotevekenysegText
dt <- impute_median(dt, formula)
Sys.sleep(3)
rounding the imputed values
dt[!is.na(AktivTulajdonosSzam), AktivTulajdonosSzam := round(AktivTulajdonosSzam, 0)]
subsetting
dt <- data.table(dt[, .(Adoszam, AktivPrivatTulajdonos, AktivTulajdonosSzam)])
initial preparation of company list 2
company_list_2 <- copy(data.table(company_list_1))
right join the imputed values into company_list_2
setkey(dt, Adoszam)
setkey(company_list_2, Adoszam)
company_list_2 <- data.table(dt[company_list_2])
company_list_2[is.na(AktivPrivatTulajdonos), AktivPrivatTulajdonos := i.AktivPrivatTulajdonos]
company_list_2[is.na(AktivTulajdonosSzam), AktivTulajdonosSzam := i.AktivTulajdonosSzam]
company_list_2[, ':=' (i.AktivPrivatTulajdonos = NULL, i.AktivTulajdonosSzam = NULL)]
saving
file_out = paste(getwd(), '/_tmp/company_list_2.RData', sep = '')
fwrite(company_list_2,
file = file_out,
nThread = getDTthreads())
let’s check the imputation again
ct <- check_that(company_list_2,
filled_AktivTulajdonosSzam = stri_length(AktivTulajdonosSzam) >= 1,
filled_AktivPrivatTulajdonos = stri_length(AktivPrivatTulajdonos) >= 1,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1,
valid_AktivPrivatTulajdonos = AktivTulajdonosSzam >= AktivPrivatTulajdonos,
filled_Cegforma = stri_length(Cegforma) >= 1,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos_and_Cegforma = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1
& stri_length(Cegforma) >= 1
)
dt <- data.table(summary(ct))
dt <- dt[, c(1:5, 8)]
let’s see the validity of the data
pander(dt, split.table = 200)
-----------------------------------------------------------------------------------------------------------------------------------
rule items passes fails nNA expression
----------------------------------------------------------------- ------- -------- ------- ----- ----------------------------------
filled_AktivTulajdonosSzam 435439 435071 0 368 stri_length(AktivTulajdonosSzam)
>= 1
filled_AktivPrivatTulajdonos 435439 435071 0 368 stri_length(AktivPrivatTulajdonos)
>= 1
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos 435439 435071 0 368 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1
valid_AktivPrivatTulajdonos 435439 435071 0 368 AktivTulajdonosSzam >=
AktivPrivatTulajdonos
filled_Cegforma 435439 400234 35205 0 stri_length(Cegforma) >= 1
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos_and_Cegforma 435439 400149 35205 85 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1 & stri_length(Cegforma)
>= 1
-----------------------------------------------------------------------------------------------------------------------------------
deallocating memory!!
rm(company_list_1)
Some Cegforma values are missing / I impute them with the median value in each Megye with same FotevekenysegText and AFAKod
subsetting into dt
dt <- copy(data.table(company_list_2[!is.na(Adoszam)
& !is.na(AktivPrivatTulajdonos)
& !is.na(AktivTulajdonosSzam)
& !is.na(Megye)
& !is.na(FotevekenysegText)
& !is.na(unreg_emp)
& !is.na(AFAKod),
.(Adoszam,
Cegforma,
AktivPrivatTulajdonos,
AktivTulajdonosSzam,
Megye,
FotevekenysegText,
unreg_emp,
AFAKod)]))
factorization of categorical variables
dt[, ':=' (Cegforma = as.factor(Cegforma),
Megye = as.factor(Megye),
FotevekenysegText = as.factor(FotevekenysegText),
unreg_emp = as.factor(unreg_emp),
AFAKod = as.factor(AFAKod)
)]
imputation of Cegforma
formula <- Cegforma ~ AktivPrivatTulajdonos + AktivTulajdonosSzam + Megye + FotevekenysegText + unreg_emp + AFAKod
dt <- data.table(impute_mf(dt, formula))
missForest iteration 1 in progress...done!
missForest iteration 2 in progress...done!
joining
dt <- dt[, .(Adoszam, Cegforma)]
setkey(dt, Adoszam)
setkey(company_list_2, Adoszam)
company_list_2 <- data.table(dt[company_list_2])
company_list_2[is.na(Cegforma), Cegforma := i.Cegforma]
company_list_2[, i.Cegforma := NULL]
saving
file_out = paste(getwd(), '/_tmp/company_list_2.RData', sep = '')
fwrite(company_list_2,
file = file_out,
nThread = getDTthreads())
let’s check the imputation again
ct <- check_that(company_list_2,
filled_AktivTulajdonosSzam = stri_length(AktivTulajdonosSzam) >= 1,
filled_AktivPrivatTulajdonos = stri_length(AktivPrivatTulajdonos) >= 1,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1,
valid_AktivPrivatTulajdonos = AktivTulajdonosSzam >= AktivPrivatTulajdonos,
filled_Cegforma = stri_length(Cegforma) >= 1,
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos_and_Cegforma = stri_length(AktivTulajdonosSzam) >= 1
& stri_length(AktivPrivatTulajdonos) >= 1
& stri_length(Cegforma) >= 1
)
temp <- data.table(summary(ct))
temp <- temp[, c(1:5, 8)]
let’s see the validity of the data
pander(temp, split.table = 200)
-----------------------------------------------------------------------------------------------------------------------------------
rule items passes fails nNA expression
----------------------------------------------------------------- ------- -------- ------- ----- ----------------------------------
filled_AktivTulajdonosSzam 435439 435071 0 368 stri_length(AktivTulajdonosSzam)
>= 1
filled_AktivPrivatTulajdonos 435439 435071 0 368 stri_length(AktivPrivatTulajdonos)
>= 1
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos 435439 435071 0 368 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1
valid_AktivPrivatTulajdonos 435439 435071 0 368 AktivTulajdonosSzam >=
AktivPrivatTulajdonos
filled_Cegforma 435439 400234 35205 0 stri_length(Cegforma) >= 1
filled_AktivTulajdonosSzam_and_AktivPrivatTulajdonos_and_Cegforma 435439 400149 35205 85 stri_length(AktivTulajdonosSzam)
>= 1 &
stri_length(AktivPrivatTulajdonos)
>= 1 & stri_length(Cegforma)
>= 1
-----------------------------------------------------------------------------------------------------------------------------------
Let’s take a look at histograms of columns (variables) of company list 2
p11 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(SajatToke + 1)),
fill = 'blue',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_SajatToke') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_SajatToke') +
theme_igray()
Sys.sleep(2)
p12 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(MerlegEredmeny + 1)),
fill = 'green',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_MerlegEredmeny') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_MerlegEredmeny') +
theme_igray()
Sys.sleep(2)
p13 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(Arbevetel + 1)),
fill = 'orange',
position = 'identity',
binwidth = 0.3,
alpha = I(0.7)) +
ggtitle('Histogram for log10_Arbevetel') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_Arbevetel') +
theme_igray()
Sys.sleep(2)
p14 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(AktivPrivatTulajdonos +1)),
fill = 'red',
position = 'identity',
binwidth = 0.1,
alpha = I(0.7)) +
ggtitle('Histogram for log10_AktivPrivatTulajdonos') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_AktivPrivatTulajdonos') +
theme_igray()
Sys.sleep(2)
p15 <- ggplot(company_list_2) +
geom_histogram(aes(x = AktivTulajdonosSzam),
fill = 'green',
position = 'identity',
binwidth = 0.1,
alpha = I(0.7)) +
ggtitle('Histogram for log10_AktivTulajdonosSzam') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_AktivTulajdonosSzam') +
scale_x_log10() +
theme_igray()
Sys.sleep(2)
p16 <- ggplot(company_list_2) +
geom_histogram(aes(x = TulajCegesBankiUgyfelNr),
fill = 'brown',
position = 'identity',
#binwidth = 1,
alpha = I(0.7)) +
ggtitle('Histogram for TulajCegesBankiUgyfelNr') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('TulajCegesBankiUgyfelNr') +
theme_igray()
Sys.sleep(2)
p17 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(Cegkora + 1)),
fill = 'khaki',
position = 'identity',
binwidth = 0.2,
alpha = I(0.7)) +
ggtitle('Histogram for log10_Cegkora') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_Cegkora') +
theme_igray()
Sys.sleep(2)
p18 <- ggplot(company_list_2) +
geom_histogram(aes(x = log10(LegkozFiokTavProxy + 1)),
fill = 'magenta',
position = 'identity',
binwidth = 0.2,
alpha = I(0.7)) +
ggtitle('Histogram for log10_LegkozFiokTavProxy') +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
xlab('log10_LegkozFiokTavProxy') +
theme_igray()
Sys.sleep(2)
p19 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(Cegforma)),
fill = 'brown',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of Cegforma') +
xlab('Cegforma') +
theme_igray()
Sys.sleep(2)
p20 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(NAVTeruletKod)),
fill = 'purple',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of NAVTeruletKod') +
xlab('NAVTeruletKod') +
theme_igray()
Sys.sleep(2)
p21 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(Megye)),
fill = 'pink',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of Megye') +
xlab('Megye') +
theme_igray()
Sys.sleep(2)
p22 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(Varos)),
fill = 'green',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of Varos') +
xlab('Varos') +
theme_igray()
Sys.sleep(2)
p23 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(unreg_emp)),
fill = 'blue',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of unreg_emp') +
xlab('unreg_emp') +
theme_igray()
Sys.sleep(2)
p24 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(FotevekenysegText)),
fill = 'magenta',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of FotevekenysegText') +
xlab('FotevekenysegText') +
theme_igray()
Sys.sleep(2)
p25 <- ggplot(company_list_2) +
geom_bar(aes(x = as.factor(AFAKod)),
fill = 'orange',
position = 'identity',
alpha = I(0.7)) +
facet_grid(BankiUgyfel ~ ., scales = 'free', labeller = labeller(BankiUgyfel = c('TRUE' = 'Banki Ugyfel',
'FALSE' = 'Not Banki Ugyfel'))) +
ggtitle('Breakdown of AFAKod') +
xlab('AFAKod') +
theme_igray()
Sys.sleep(2)
without plotly in one grid:
# multiplot(p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, cols = 2)
transforming into interactive graph
pp11 <- ggplotly(p11)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp11.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp11), file_out)
transforming into interactive graph
pp12 <- ggplotly(p12)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp12.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp12), file_out)
transforming into interactive graph
pp13 <- ggplotly(p13)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp13.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp13), file_out)
transforming into interactive graph
pp14 <- ggplotly(p14)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp14.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp14), file_out)
transforming into interactive graph
pp15 <- ggplotly(p15)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp15.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp15), file_out)
transforming into interactive graph
pp16 <- ggplotly(p16)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp16.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp16), file_out)
transforming into interactive graph
pp17 <- ggplotly(p17)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp17.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp17), file_out)
transforming into interactive graph
pp18 <- ggplotly(p18)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp18.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp18), file_out)
transforming into interactive graph
pp19 <- ggplotly(p19)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp19.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp19), file_out)
transforming into interactive graph
pp20 <- ggplotly(p20)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp20.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp20), file_out)
transforming into interactive graph
pp21 <- ggplotly(p21)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp21.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp21), file_out)
transforming into interactive graph
pp22 <- ggplotly(p22)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp22.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp22), file_out)
transforming into interactive graph
pp23 <- ggplotly(p23)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp23.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp23), file_out)
transforming into interactive graph
pp24 <- ggplotly(p24)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp24.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp24), file_out)
transforming into interactive graph
pp25 <- ggplotly(p25)
Sys.sleep(2)
saving
file_out = paste(getwd(), '/pp25.html', sep = '')
htmlwidgets::saveWidget(as.widget(pp25), file_out)
plotting
layout(pp11, dragmode = 'pan')
Sys.sleep(2)
layout(pp12, dragmode = 'pan')
Sys.sleep(2)
layout(pp13, dragmode = 'pan')
Sys.sleep(2)
layout(pp14, dragmode = 'pan')
Sys.sleep(2)
layout(pp15, dragmode = 'pan')
Sys.sleep(2)
layout(pp16, dragmode = 'pan')
Sys.sleep(2)
layout(pp17, dragmode = 'pan')
Sys.sleep(2)
layout(pp18, dragmode = 'pan')
Sys.sleep(2)
layout(pp19, dragmode = 'pan')
Sys.sleep(2)
layout(pp20, dragmode = 'pan')
Sys.sleep(2)
layout(pp21, dragmode = 'pan')
Sys.sleep(2)
layout(pp22, dragmode = 'pan')
Sys.sleep(2)
layout(pp23, dragmode = 'pan')
Sys.sleep(2)
layout(pp24, dragmode = 'pan')
Sys.sleep(2)
layout(pp25, dragmode = 'pan')
Sys.sleep(2)
moving those plotly files into tmp folder
filez <- list.files(getwd(), pattern = '^pp\\d+\\.html$')
sapply(filez, FUN = function(eachPath) {
file.rename(from = eachPath,
to = sub(pattern = 'pp',
replacement = './_tmp/pp',
eachPath))
})
pp1.html pp10.html pp11.html pp12.html pp13.html pp14.html pp15.html pp16.html pp17.html pp18.html pp19.html pp2.html
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
pp20.html pp21.html pp22.html pp23.html pp24.html pp25.html pp26.html pp3.html pp4.html pp5.html pp6.html pp7.html
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
pp8.html pp9.html
TRUE TRUE
deallocating memory!!
rm(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25)
rm(pp1, pp2, pp3, pp4, pp5, pp6, pp7, pp8, pp9, pp10, pp11, pp12, pp13, pp14, pp15, pp16, pp17, pp18, pp19, pp20, pp21, pp22, pp23, pp24, pp25)
According to compared distributions, these may be predictor variables: SajatToke MerlegEredmeny AktivTulajdonosSzam TulajCegesBankiUgyfelNr Cegkora LegkozFiokTavProxy Cegforma NAVTeruletKod Megye Varos FotevekenysegText AFAKod
I put them and the target variable (BankiUgyfel) into a new data table and blended NAs.
I omitted the Varos variable, because I think that is too restrictive.
company_list_3 <- copy(data.table(company_list_2[!is.na(BankiUgyfel) &
!is.na(SajatToke) &
!is.na(MerlegEredmeny) &
!is.na(AktivTulajdonosSzam) &
!is.na(TulajCegesBankiUgyfelNr) &
!is.na(Cegkora) &
!is.na(LegkozFiokTavProxy) &
!is.na(Cegforma) &
!is.na(NAVTeruletKod) &
!is.na(Megye) &
!is.na(FotevekenysegText) &
!is.na(AFAKod), .(BankiUgyfel,
SajatToke,
MerlegEredmeny,
AktivTulajdonosSzam,
TulajCegesBankiUgyfelNr,
Cegkora,
LegkozFiokTavProxy,
Cegforma,
NAVTeruletKod,
Megye,
FotevekenysegText,
AFAKod)]))
making factors from text variables
company_list_3[, ':=' (NAVTeruletKod = as.factor(NAVTeruletKod), Megye = as.factor(Megye), FotevekenysegText = as.factor(FotevekenysegText), AFAKod =as.factor(AFAKod))]
pander(str(company_list_3))
Classes ‘data.table’ and 'data.frame': 433668 obs. of 12 variables:
$ BankiUgyfel : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ SajatToke : num 6.47e+07 1.36e+10 1.31e+09 3.52e+09 1.01e+09 ...
$ MerlegEredmeny : num 6.04e+07 1.79e+09 -1.90e+07 5.48e+07 0.00 ...
$ AktivTulajdonosSzam : num 17 27 7 1 1 4 1 16 118 2 ...
$ TulajCegesBankiUgyfelNr: int 0 0 0 0 0 0 0 0 0 0 ...
$ Cegkora : int 27 26 70 66 27 49 48 66 27 42 ...
$ LegkozFiokTavProxy : num 2.52 185.78 0 4.13 4.8 ...
$ Cegforma : Factor w/ 6 levels "","Bt","Egyeb",..: 1 6 6 6 6 6 6 6 1 4 ...
$ NAVTeruletKod : Factor w/ 37 levels "01","02","03",..: 30 8 32 32 30 13 30 33 31 31 ...
$ Megye : Factor w/ 21 levels "Bacs-Kiskun",..: 6 9 6 6 6 15 6 6 6 6 ...
$ FotevekenysegText : Factor w/ 22 levels "AdminisztrativEsSzolgaltatastTamogatoTevekenyseg",..: 14 14 19 16 9 14 16 16 3 10 ...
$ AFAKod : Factor w/ 9 levels "0","1","2","3",..: 3 3 3 3 3 3 3 3 3 3 ...
- attr(*, ".internal.selfref")=<externalptr>
saving
file_out = paste(getwd(), '/_tmp/company_list_3.RData', sep = '')
fwrite(company_list_3,
file = file_out,
nThread = getDTthreads())
let’s check a sample of the data
pander(company_list_3[sample(.N, 5)], split.table = 200)
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
BankiUgyfel SajatToke MerlegEredmeny AktivTulajdonosSzam TulajCegesBankiUgyfelNr Cegkora LegkozFiokTavProxy Cegforma NAVTeruletKod Megye FotevekenysegText AFAKod
------------- ----------- ---------------- --------------------- ------------------------- --------- -------------------- ---------- --------------- -------- --------------------------- --------
FALSE 0 0 1 0 7 0 41 Budapest Epitoipar 2
FALSE 3615000 -2710000 2 0 10 0 Kft 43 Budapest InformacioKommunikacio 2
FALSE 105326000 36399000 1 0 16 8.322 Kft 13 Pest Feldolgozoipar 2
FALSE -12369000 -1173000 1 0 16 71.21 Kft 10 Heves KereskedelemGepjarmujavitas 2
FALSE 4602000 -3142000 1 0 4 0 Kft 43 Budapest KereskedelemGepjarmujavitas 2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
deallocating memory!!
rm(company_list_2)
splitting into train, test & validation subset
company_list_3 <- data.table(fread(paste(getwd(), '/_tmp/company_list_3.RData', sep = '')))
set.seed(20140407)
n <- nrow(company_list_3)
idx_train <- sample(1:n, 0.5*n)
idx_test <- sample(setdiff(1:n, idx_train), 0.25*n)
idx_valid <- sample(setdiff(setdiff(1:n, idx_train), idx_test), 0.25*n)
d_train <- company_list_3[idx_train,]
d_test <- company_list_3[idx_test,]
d_valid <- company_list_3[idx_valid,]
pandoc.header(paste('size of train subset: ', dim(d_train)[1], sep = ''))
# size of train subset: 216834
pandoc.header(paste('size of test subset: ', dim(d_test)[1], sep = ''))
# size of test subset: 108417
pandoc.header(paste('size of test subset: ', dim(d_valid)[1], sep = ''))
# size of test subset: 108417
deallocating memory
rm(company_list_3)
I chose random forest decision tree for predicting targetable companies and I used h2o since it is fast & powerful . RF is a swiss-army-knife method for classification. It means boostrapping data, building trees, aggregating with random subset of variable at each split.
computing optimal memory level for h2o
mem <- paste(as.character(round(as.numeric(system("awk '/Mem/ {print $2}' /proc/meminfo", intern=TRUE))[1]*0.8/1024/1024, 0)), 'g', sep = '')
initialize h2o Java server (R connects via REST) - setting RAM size & thread numbers to maximum available
h2o.init(max_mem_size = mem,
nthreads = -1)
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 5 hours 13 minutes
H2O cluster version: 3.10.4.6
H2O cluster version age: 13 days
H2O cluster name: H2O_started_from_R_sbudai_wzr283
H2O cluster total nodes: 1
H2O cluster total memory: 5.21 GB
H2O cluster total cores: 4
H2O cluster allowed cores: 4
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
R Version: R version 3.4.0 (2017-04-21)
uploading data to H2O
dh2o_train <- as.h2o(d_train)
|
| | 0%
|
|==================================================================================================================| 100%
dh2o_test <- as.h2o(d_test)
|
| | 0%
|
|==================================================================================================================| 100%
dh2o_valid <- as.h2o(d_valid)
|
| | 0%
|
|==================================================================================================================| 100%
machine learning I played around with several settings and these seemed good enough
model <- h2o.randomForest(x = 2:ncol(dh2o_train), # predictor variables
y = 1, # target variable
training_frame = dh2o_train, # speaks for itself
model_id = 'BankiUgyfelModel', # being name of the model
nfolds = 5, # nr of folds for N-fold cross-validation
mtries = -1, # nr of variables randomly sampled as candidates at each split. sqrt(# of predictors)
ntrees = 30, # nr of trees
max_depth = 5 # maximum tree depth
)
|
| | 0%
|
|= | 1%
|
|=== | 2%
|
|=== | 3%
|
|====== | 6%
|
|=========== | 10%
|
|================= | 15%
|
|=================== | 17%
|
|==================== | 18%
|
|===================== | 18%
|
|====================== | 19%
|
|=========================== | 24%
|
|================================== | 29%
|
|====================================== | 33%
|
|======================================= | 34%
|
|======================================== | 35%
|
|========================================= | 36%
|
|========================================== | 37%
|
|=============================================== | 41%
|
|=================================================== | 44%
|
|======================================================= | 48%
|
|========================================================= | 50%
|
|========================================================== | 51%
|
|============================================================ | 52%
|
|============================================================ | 53%
|
|=============================================================== | 56%
|
|==================================================================== | 59%
|
|========================================================================== | 65%
|
|============================================================================ | 67%
|
|============================================================================= | 68%
|
|============================================================================== | 68%
|
|=============================================================================== | 69%
|
|=================================================================================== | 73%
|
|========================================================================================= | 78%
|
|=============================================================================================== | 83%
|
|================================================================================================ | 84%
|
|===================================================================================================== | 89%
|
|========================================================================================================== | 93%
|
|=============================================================================================================== | 98%
|
|================================================================================================================= | 99%
|
|==================================================================================================================| 100%
deallocating memory
rm(d_train, d_test)
Let’s see how it performs. Our goal is to maximize model accuracy but avoiding overfitting at the same time.
Area Under the Curve preparation for plotting ROC for training dataset
dt2 <- data.table(h2o.performance(model, dh2o_train)@metrics$thresholds_and_metric_scores[18:19])
p26 <- ggplot(dt2, aes(x = fpr, y = tpr)) +
geom_point(color = 'blue',
pch = 1,
alpha = 0.7) +
coord_fixed(ratio = 1) +
xlab('False Positive Rate') +
ylab('True Positive Rate') +
ggtitle('Receiver Operating Characteristic of model on traning dataset') +
theme_igray()
saving ROC for training dataset
file_out = paste(getwd(), '/_tmp/', sep = '')
ggsave(path = file_out, filename = 'p26.png')
plotting ROC for training dataset
plot(p26)
Sys.sleep(2)
Area Under the Curve results You can also see/check/examine the data on the h2o user interface: http://localhost:54321 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
auc_train <- round(h2o.auc(h2o.performance(model, dh2o_train)), 5)
auc_test <- round(h2o.auc(h2o.performance(model, dh2o_test)), 5)
auc_valid <- round(h2o.auc(h2o.performance(model, dh2o_valid)), 5)
pandoc.header('Area Under the Curve results')
# Area Under the Curve results
pandoc.header('0.90-1.00 = excellent')
# 0.90-1.00 = excellent
pandoc.header('0.80-0.90 = good')
# 0.80-0.90 = good
pandoc.header('0.70-0.80 = fair')
# 0.70-0.80 = fair
pandoc.header('0.60-0.70 = poor')
# 0.60-0.70 = poor
pandoc.header('0.50-0.60 = fail')
# 0.50-0.60 = fail
pandoc.header('')
#
pandoc.header('Our AUC is excellent for all the train, test and validation dataset. In addition they are very close to each other which means no overfitting!!')
# Our AUC is excellent for all the train, test and validation dataset. In addition they are very close to each other which means no overfitting!!
pandoc.header(paste('training dataset AUC: ', auc_train, sep = ''))
# training dataset AUC: 0.91347
pandoc.header(paste('testing dataset AUC: ', auc_test, sep = ''))
# testing dataset AUC: 0.90053
pandoc.header(paste('validation dataset AUC: ', auc_valid, sep = ''))
# validation dataset AUC: 0.90433
Let see the confusion matrices You can also see/check/examine the data on the h2o user interface: http://localhost:54321 https://en.wikipedia.org/wiki/Confusion_matrix
pandoc.header(' ')
#
pander('The number of missclassified records are below 3% in each dataset')
The number of missclassified records are below 3% in each dataset
pandoc.header(' ')
#
pandoc.header('confusion matrix of training dataset')
# confusion matrix of training dataset
pandoc.table(h2o.confusionMatrix(model, dh2o_train))
------------------------------------------------
FALSE TRUE Error Rate
------------ ------- ------ ------- ------------
**FALSE** 210883 3594 0.01676 =3594/214477
**TRUE** 1036 1321 0.4395 =1036/2357
**Totals** 211919 4915 0.02135 =4630/216834
------------------------------------------------
pandoc.header('confusion matrix of testing dataset')
# confusion matrix of testing dataset
pandoc.table(h2o.confusionMatrix(model, dh2o_test))
------------------------------------------------
FALSE TRUE Error Rate
------------ ------- ------ ------- ------------
**FALSE** 104862 2309 0.02155 =2309/107171
**TRUE** 487 759 0.3909 =487/1246
**Totals** 105349 3068 0.02579 =2796/108417
------------------------------------------------
pandoc.header('confusion matrix of validation dataset')
# confusion matrix of validation dataset
pandoc.table(h2o.confusionMatrix(model, dh2o_valid))
------------------------------------------------
FALSE TRUE Error Rate
------------ ------- ------ ------- ------------
**FALSE** 105286 1934 0.01804 =1934/107220
**TRUE** 510 687 0.4261 =510/1197
**Totals** 105796 2621 0.02254 =2444/108417
------------------------------------------------
all metrics of the model You can also see/check/examine the data on the h2o user interface: http://localhost:54321
pandoc.header(' ')
#
pandoc.header('all metrics of the model for training dataset')
# all metrics of the model for training dataset
print(h2o.performance(model, dh2o_train))
H2OBinomialMetrics: drf
MSE: 0.009096576
RMSE: 0.09537597
LogLoss: 0.03870714
Mean Per-Class Error: 0.2281494
AUC: 0.9134653
Gini: 0.8269305
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
FALSE TRUE Error Rate
FALSE 210883 3594 0.016757 =3594/214477
TRUE 1036 1321 0.439542 =1036/2357
Totals 211919 4915 0.021353 =4630/216834
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
1 max f1 0.144500 0.363311 123
2 max f2 0.130193 0.494070 140
3 max f0point5 0.195928 0.319186 76
4 max accuracy 0.298412 0.989171 15
5 max precision 0.464822 1.000000 0
6 max recall 0.001783 1.000000 397
7 max specificity 0.464822 1.000000 0
8 max absolute_mcc 0.136158 0.391695 136
9 max min_per_class_accuracy 0.008103 0.831141 313
10 max mean_per_class_accuracy 0.011004 0.855092 295
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
pandoc.header(' ')
#
pandoc.header('all metrics of the model for test dataset')
# all metrics of the model for test dataset
print(h2o.performance(model, dh2o_test))
H2OBinomialMetrics: drf
MSE: 0.009727923
RMSE: 0.09863023
LogLoss: 0.04162846
Mean Per-Class Error: 0.2061979
AUC: 0.9005305
Gini: 0.801061
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
FALSE TRUE Error Rate
FALSE 104862 2309 0.021545 =2309/107171
TRUE 487 759 0.390851 =487/1246
Totals 105349 3068 0.025789 =2796/108417
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
1 max f1 0.138961 0.351878 120
2 max f2 0.093894 0.482578 152
3 max f0point5 0.193055 0.295069 65
4 max accuracy 0.347825 0.988526 1
5 max precision 0.347825 0.750000 1
6 max recall 0.001859 1.000000 396
7 max specificity 0.432264 0.999991 0
8 max absolute_mcc 0.129148 0.380771 129
9 max min_per_class_accuracy 0.008047 0.815409 309
10 max mean_per_class_accuracy 0.012854 0.846518 284
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
pandoc.header(' ')
#
pandoc.header('all metrics of the model for validation dataset')
# all metrics of the model for validation dataset
print(h2o.performance(model, dh2o_valid))
H2OBinomialMetrics: drf
MSE: 0.009277894
RMSE: 0.09632182
LogLoss: 0.03970814
Mean Per-Class Error: 0.2220514
AUC: 0.9043251
Gini: 0.8086503
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
FALSE TRUE Error Rate
FALSE 105286 1934 0.018038 =1934/107220
TRUE 510 687 0.426065 =510/1197
Totals 105796 2621 0.022543 =2444/108417
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
1 max f1 0.142376 0.359874 117
2 max f2 0.136831 0.490710 127
3 max f0point5 0.194216 0.312965 62
4 max accuracy 0.352701 0.988969 2
5 max precision 0.352701 0.666667 2
6 max recall 0.001865 1.000000 396
7 max specificity 0.401867 0.999991 0
8 max absolute_mcc 0.138420 0.390136 125
9 max min_per_class_accuracy 0.008106 0.824561 309
10 max mean_per_class_accuracy 0.040825 0.850574 218
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
deallocating memory
rm(p26)
saving model in binary
file_out = paste(getwd(), '/_results/', sep = '')
h2o.saveModel(model, path = file_out)
[1] "/home/sbudai/Documents/projects/GitHub/Client_Acquisition_for_a_Bank/_results/BankiUgyfelModel"
saving model scoring in POJO
file_out = paste(getwd(), '/_results', sep = '')
h2o.download_pojo(model, path = file_out, get_jar = TRUE)
[1] "BankiUgyfelModel.java"
get fitted values of the validation dataset (public_test_scored.csv)
BankiUgyfelModel.fit <- as.data.table(h2o.predict(object = model,
newdata = dh2o_valid))[, 2:3]
|
| | 0%
|
|==================================================================================================================| 100%
colnames(BankiUgyfelModel.fit) <- c('p0', 'p1')
thrhd <- h2o.find_threshold_by_max_metric( h2o.performance(model, dh2o_valid), 'f1')
BankiUgyfelModel.fit[, predict := FALSE]
BankiUgyfelModel.fit[p1 >= thrhd, predict := TRUE]
scored_result <- cbind(d_valid, BankiUgyfelModel.fit)
pander(scored_result[sample(.N, 5)], split.table = 200)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
BankiUgyfel SajatToke MerlegEredmeny AktivTulajdonosSzam TulajCegesBankiUgyfelNr Cegkora LegkozFiokTavProxy Cegforma NAVTeruletKod Megye
------------- ----------- ---------------- --------------------- ------------------------- --------- -------------------- ---------- --------------- --------------------
FALSE 3637000 1e+05 2 0 7 59.5 Kft 2 Baranya
FALSE 9910000 -1696000 2 0 19 16.8 Kft 13 Pest
FALSE 5828000 828000 1 0 1 4.132 Rt 43 Budapest
FALSE -2632000 -515000 2 0 3 20.39 Bt 13 Pest
FALSE 79000 0 2 0 15 132.4 Bt 5 Borsod-Abauj-Zemplen
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table: Table continues below
----------------------------------------------------------------------
FotevekenysegText AFAKod p0 p1 predict
----------------------------------- -------- ------ -------- ---------
Ingatlanugyletek 1 0.9792 0.02077 FALSE
KereskedelemGepjarmujavitas 2 0.9958 0.004166 FALSE
SzakmaiTudomanyosMuszakiTevekenyseg 2 0.9967 0.00325 FALSE
Epitoipar 1 0.9972 0.002771 FALSE
NemIsmert 3 0.9973 0.002712 FALSE
----------------------------------------------------------------------
saving predicted result set (based on validation data)
file_out = paste(getwd(), '/_results/scored_result.RData', sep = '')
fwrite(scored_result,
file = file_out,
nThread = getDTthreads())
disconnecting from h2o
h2o.shutdown(prompt = FALSE)
deallocating memory!
rm(list = ls())